!pip install -U pip
!pip install -U setuptools wheel
!pip install -U "mxnet<2.0.0" bokeh==2.0.1
!pip install autogluon --no-cache-dir
# Without --no-cache-dir, smaller aws instances may have trouble installing
Requirement already satisfied: pip in /usr/local/lib/python3.7/site-packages (21.3.1)
Collecting pip
Using cached pip-22.3-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
Attempting uninstall: pip
Found existing installation: pip 21.3.1
Uninstalling pip-21.3.1:
Successfully uninstalled pip-21.3.1
Successfully installed pip-22.3
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (59.4.0)
Collecting setuptools
Using cached setuptools-65.5.0-py3-none-any.whl (1.2 MB)
Collecting wheel
Using cached wheel-0.37.1-py2.py3-none-any.whl (35 kB)
Installing collected packages: wheel, setuptools
Attempting uninstall: setuptools
Found existing installation: setuptools 59.4.0
Uninstalling setuptools-59.4.0:
Successfully uninstalled setuptools-59.4.0
Successfully installed setuptools-65.5.0 wheel-0.37.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Collecting mxnet<2.0.0
Using cached mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
Collecting bokeh==2.0.1
Using cached bokeh-2.0.1-py3-none-any.whl
Requirement already satisfied: tornado>=5 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (6.1)
Requirement already satisfied: Jinja2>=2.7 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (3.0.3)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (2.8.2)
Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (4.0.1)
Requirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (1.19.1)
Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (21.3)
Requirement already satisfied: pillow>=4.0 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (8.4.0)
Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.7/site-packages (from bokeh==2.0.1) (5.4.1)
Requirement already satisfied: requests<3,>=2.20.0 in /usr/local/lib/python3.7/site-packages (from mxnet<2.0.0) (2.22.0)
Requirement already satisfied: graphviz<0.9.0,>=0.8.1 in /usr/local/lib/python3.7/site-packages (from mxnet<2.0.0) (0.8.4)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.7/site-packages (from Jinja2>=2.7->bokeh==2.0.1) (2.0.1)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/site-packages (from packaging>=16.8->bokeh==2.0.1) (3.0.6)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil>=2.1->bokeh==2.0.1) (1.16.0)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests<3,>=2.20.0->mxnet<2.0.0) (2.8)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/site-packages (from requests<3,>=2.20.0->mxnet<2.0.0) (1.25.11)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/site-packages (from requests<3,>=2.20.0->mxnet<2.0.0) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/site-packages (from requests<3,>=2.20.0->mxnet<2.0.0) (2021.10.8)
Installing collected packages: mxnet, bokeh
Attempting uninstall: bokeh
Found existing installation: bokeh 2.4.2
Uninstalling bokeh-2.4.2:
Successfully uninstalled bokeh-2.4.2
Successfully installed bokeh-2.0.1 mxnet-1.9.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Collecting autogluon
Downloading autogluon-0.5.2-py3-none-any.whl (9.6 kB)
Collecting autogluon.vision==0.5.2
Downloading autogluon.vision-0.5.2-py3-none-any.whl (48 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 48.8/48.8 kB 94.1 MB/s eta 0:00:00
Collecting autogluon.text==0.5.2
Downloading autogluon.text-0.5.2-py3-none-any.whl (61 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.9/61.9 kB 165.5 MB/s eta 0:00:00
Collecting autogluon.timeseries[all]==0.5.2
Downloading autogluon.timeseries-0.5.2-py3-none-any.whl (65 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.4/65.4 kB 172.5 MB/s eta 0:00:00
Collecting autogluon.core[all]==0.5.2
Downloading autogluon.core-0.5.2-py3-none-any.whl (210 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 210.4/210.4 kB 162.7 MB/s eta 0:00:00
Collecting autogluon.features==0.5.2
Downloading autogluon.features-0.5.2-py3-none-any.whl (59 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.4/59.4 kB 116.4 MB/s eta 0:00:00
Collecting autogluon.tabular[all]==0.5.2
Downloading autogluon.tabular-0.5.2-py3-none-any.whl (274 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 274.2/274.2 kB 222.4 MB/s eta 0:00:00
Collecting autogluon.multimodal==0.5.2
Downloading autogluon.multimodal-0.5.2-py3-none-any.whl (149 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 149.4/149.4 kB 197.8 MB/s eta 0:00:00
Requirement already satisfied: pandas!=1.4.0,<1.5,>=1.2.5 in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (1.3.4)
Collecting autogluon.common==0.5.2
Downloading autogluon.common-0.5.2-py3-none-any.whl (37 kB)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (3.5.0)
Collecting scipy<1.8.0,>=1.5.4
Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 38.1/38.1 MB 143.6 MB/s eta 0:00:00a 0:00:01
Requirement already satisfied: boto3 in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (1.20.17)
Requirement already satisfied: requests in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (2.22.0)
Collecting numpy<1.23,>=1.21
Downloading numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.7/15.7 MB 157.7 MB/s eta 0:00:00a 0:00:01
Collecting dask<=2021.11.2,>=2021.09.1
Downloading dask-2021.11.2-py3-none-any.whl (1.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 162.9 MB/s eta 0:00:00
Collecting distributed<=2021.11.2,>=2021.09.1
Downloading distributed-2021.11.2-py3-none-any.whl (802 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 802.2/802.2 kB 235.9 MB/s eta 0:00:00
Requirement already satisfied: tqdm>=4.38.0 in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (4.39.0)
Requirement already satisfied: scikit-learn<1.1,>=1.0.0 in /usr/local/lib/python3.7/site-packages (from autogluon.core[all]==0.5.2->autogluon) (1.0.1)
Collecting ray<1.14,>=1.13
Downloading ray-1.13.0-cp37-cp37m-manylinux2014_x86_64.whl (54.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.5/54.5 MB 173.7 MB/s eta 0:00:00a 0:00:01
Collecting hyperopt<0.2.8,>=0.2.7
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 218.7 MB/s eta 0:00:00
Requirement already satisfied: psutil<6,>=5.7.3 in /usr/local/lib/python3.7/site-packages (from autogluon.features==0.5.2->autogluon) (5.8.0)
Collecting torch<1.13,>=1.9
Downloading torch-1.12.1-cp37-cp37m-manylinux1_x86_64.whl (776.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 776.3/776.3 MB 162.6 MB/s eta 0:00:0000:0100:01
Collecting torchtext<0.14.0
Downloading torchtext-0.13.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 222.7 MB/s eta 0:00:00
Collecting nlpaug<=1.1.10,>=1.1.10
Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.8/410.8 kB 165.2 MB/s eta 0:00:00
Collecting Pillow<9.1.0,>=9.0.1
Downloading Pillow-9.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.3/4.3 MB 189.3 MB/s eta 0:00:00
Collecting omegaconf<2.2.0,>=2.1.1
Downloading omegaconf-2.1.2-py3-none-any.whl (74 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 74.7/74.7 kB 174.8 MB/s eta 0:00:00
Collecting pytorch-metric-learning<1.4.0,>=1.3.0
Downloading pytorch_metric_learning-1.3.2-py3-none-any.whl (109 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 109.4/109.4 kB 199.6 MB/s eta 0:00:00
Collecting torchmetrics<0.8.0,>=0.7.2
Downloading torchmetrics-0.7.3-py3-none-any.whl (398 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 398.2/398.2 kB 240.1 MB/s eta 0:00:00
Collecting pytorch-lightning<1.7.0,>=1.6.0
Downloading pytorch_lightning-1.6.5-py3-none-any.whl (585 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 585.9/585.9 kB 239.6 MB/s eta 0:00:00
Collecting scikit-image<0.20.0,>=0.19.1
Downloading scikit_image-0.19.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (13.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.5/13.5 MB 163.6 MB/s eta 0:00:00a 0:00:01
Collecting smart-open<5.3.0,>=5.2.1
Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.6/58.6 kB 160.7 MB/s eta 0:00:00
Collecting nptyping<1.5.0,>=1.4.4
Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)
Collecting torchvision<0.14.0
Downloading torchvision-0.13.1-cp37-cp37m-manylinux1_x86_64.whl (19.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.1/19.1 MB 170.1 MB/s eta 0:00:00a 0:00:01
Collecting timm<0.6.0
Downloading timm-0.5.4-py3-none-any.whl (431 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 431.5/431.5 kB 217.5 MB/s eta 0:00:00
Collecting protobuf<=3.18.1
Downloading protobuf-3.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 245.4 MB/s eta 0:00:00
Collecting nltk<4.0.0,>=3.4.5
Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 240.7 MB/s eta 0:00:00
Collecting transformers<4.21.0,>=4.18.0
Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.4/4.4 MB 192.2 MB/s eta 0:00:00
Collecting sentencepiece<0.2.0,>=0.1.95
Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 246.0 MB/s eta 0:00:00
Collecting fairscale<=0.4.6,>=0.4.5
Downloading fairscale-0.4.6.tar.gz (248 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 248.2/248.2 kB 224.4 MB/s eta 0:00:00
Installing build dependencies ... done
Getting requirements to build wheel ... done
Installing backend dependencies ... done
Preparing metadata (pyproject.toml) ... done
Requirement already satisfied: networkx<3.0,>=2.3 in /usr/local/lib/python3.7/site-packages (from autogluon.tabular[all]==0.5.2->autogluon) (2.6.3)
Collecting fastai<2.8,>=2.3.1
Downloading fastai-2.7.9-py3-none-any.whl (225 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 225.5/225.5 kB 188.1 MB/s eta 0:00:00
Collecting catboost<1.1,>=1.0
Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.6/76.6 MB 177.6 MB/s eta 0:00:00a 0:00:01
Collecting lightgbm<3.4,>=3.3
Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl (2.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 193.7 MB/s eta 0:00:00
Collecting xgboost<1.5,>=1.4
Downloading xgboost-1.4.2-py3-none-manylinux2010_x86_64.whl (166.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.7/166.7 MB 153.1 MB/s eta 0:00:0000:0100:01
Collecting autogluon-contrib-nlp==0.0.1b20220208
Downloading autogluon_contrib_nlp-0.0.1b20220208-py3-none-any.whl (157 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 157.3/157.3 kB 208.4 MB/s eta 0:00:00
Collecting gluonts<0.10.0,>=0.8.0
Downloading gluonts-0.9.9-py3-none-any.whl (2.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.8/2.8 MB 216.3 MB/s eta 0:00:00
Collecting sktime~=0.11.4
Downloading sktime-0.11.4-py3-none-any.whl (6.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.7/6.7 MB 155.1 MB/s eta 0:00:00a 0:00:01
Collecting tbats~=1.1
Downloading tbats-1.1.1-py3-none-any.whl (43 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.8/43.8 kB 143.8 MB/s eta 0:00:00
Collecting pmdarima~=1.8.2
Downloading pmdarima-1.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 169.3 MB/s eta 0:00:00
Collecting gluoncv<0.10.6,>=0.10.5
Downloading gluoncv-0.10.5.post0-py2.py3-none-any.whl (1.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 242.2 MB/s eta 0:00:00
Collecting tokenizers>=0.9.4
Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 125.5 MB/s eta 0:00:00a 0:00:01
Collecting regex
Downloading regex-2022.9.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 757.0/757.0 kB 212.4 MB/s eta 0:00:00
Collecting flake8
Downloading flake8-5.0.4-py2.py3-none-any.whl (61 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.9/61.9 kB 147.8 MB/s eta 0:00:00
Collecting sacrebleu
Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.9/118.9 kB 200.1 MB/s eta 0:00:00
Collecting sentencepiece<0.2.0,>=0.1.95
Downloading sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 235.7 MB/s eta 0:00:00
Collecting yacs>=0.1.6
Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Requirement already satisfied: pyarrow in /usr/local/lib/python3.7/site-packages (from autogluon-contrib-nlp==0.0.1b20220208->autogluon.text==0.5.2->autogluon) (6.0.1)
Collecting contextvars
Downloading contextvars-2.4.tar.gz (9.6 kB)
Preparing metadata (setup.py) ... done
Collecting sacremoses>=0.0.38
Downloading sacremoses-0.0.53.tar.gz (880 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 880.6/880.6 kB 239.9 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: plotly in /usr/local/lib/python3.7/site-packages (from catboost<1.1,>=1.0->autogluon.tabular[all]==0.5.2->autogluon) (5.4.0)
Requirement already satisfied: six in /usr/local/lib/python3.7/site-packages (from catboost<1.1,>=1.0->autogluon.tabular[all]==0.5.2->autogluon) (1.16.0)
Requirement already satisfied: graphviz in /usr/local/lib/python3.7/site-packages (from catboost<1.1,>=1.0->autogluon.tabular[all]==0.5.2->autogluon) (0.8.4)
Collecting toolz>=0.8.2
Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.8/55.8 kB 177.6 MB/s eta 0:00:00
Collecting partd>=0.3.10
Downloading partd-1.3.0-py3-none-any.whl (18 kB)
Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/site-packages (from dask<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (5.4.1)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/site-packages (from dask<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (21.3)
Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.7/site-packages (from dask<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (2.0.0)
Requirement already satisfied: fsspec>=0.6.0 in /usr/local/lib/python3.7/site-packages (from dask<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (2021.11.1)
Collecting click>=6.6
Downloading click-8.1.3-py3-none-any.whl (96 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 96.6/96.6 kB 194.3 MB/s eta 0:00:00
Collecting tblib>=1.6.0
Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)
Collecting sortedcontainers!=2.0.0,!=2.0.1
Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (from distributed<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (65.5.0)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/site-packages (from distributed<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (3.0.3)
Requirement already satisfied: tornado>=5 in /usr/local/lib/python3.7/site-packages (from distributed<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (6.1)
Collecting msgpack>=0.6.0
Downloading msgpack-1.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 299.8/299.8 kB 215.5 MB/s eta 0:00:00
Collecting zict>=0.1.3
Downloading zict-2.2.0-py2.py3-none-any.whl (23 kB)
Collecting spacy<4
Downloading spacy-3.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 201.4 MB/s eta 0:00:00
Collecting fastprogress>=0.2.4
Downloading fastprogress-1.0.3-py3-none-any.whl (12 kB)
Collecting fastcore<1.6,>=1.4.5
Downloading fastcore-1.5.27-py3-none-any.whl (67 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.1/67.1 kB 155.9 MB/s eta 0:00:00
Collecting fastdownload<2,>=0.0.5
Downloading fastdownload-0.0.7-py3-none-any.whl (12 kB)
Requirement already satisfied: pip in /usr/local/lib/python3.7/site-packages (from fastai<2.8,>=2.3.1->autogluon.tabular[all]==0.5.2->autogluon) (22.3)
Requirement already satisfied: portalocker in /usr/local/lib/python3.7/site-packages (from gluoncv<0.10.6,>=0.10.5->autogluon.vision==0.5.2->autogluon) (2.3.2)
Requirement already satisfied: opencv-python in /usr/local/lib/python3.7/site-packages (from gluoncv<0.10.6,>=0.10.5->autogluon.vision==0.5.2->autogluon) (4.5.4.60)
Collecting autocfg
Downloading autocfg-0.0.8-py3-none-any.whl (13 kB)
Collecting pydantic~=1.1
Downloading pydantic-1.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.8/11.8 MB 169.3 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.7/site-packages (from gluonts<0.10.0,>=0.8.0->autogluon.timeseries[all]==0.5.2->autogluon) (4.0.1)
Collecting holidays>=0.9
Downloading holidays-0.16-py3-none-any.whl (184 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 184.6/184.6 kB 212.0 MB/s eta 0:00:00
Collecting future
Downloading future-0.18.2.tar.gz (829 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 829.2/829.2 kB 242.6 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Collecting py4j
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 kB 217.7 MB/s eta 0:00:00
Requirement already satisfied: wheel in /usr/local/lib/python3.7/site-packages (from lightgbm<3.4,>=3.3->autogluon.tabular[all]==0.5.2->autogluon) (0.37.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (0.11.0)
Requirement already satisfied: setuptools-scm>=4 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (6.3.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (3.0.6)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (1.3.2)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.7/site-packages (from matplotlib->autogluon.core[all]==0.5.2->autogluon) (4.28.2)
Requirement already satisfied: joblib in /usr/local/lib/python3.7/site-packages (from nltk<4.0.0,>=3.4.5->autogluon.multimodal==0.5.2->autogluon) (1.1.0)
Collecting typish>=1.7.0
Downloading typish-1.9.3-py3-none-any.whl (45 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 45.1/45.1 kB 143.8 MB/s eta 0:00:00
Collecting antlr4-python3-runtime==4.8
Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 112.4/112.4 kB 197.2 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/site-packages (from pandas!=1.4.0,<1.5,>=1.2.5->autogluon.core[all]==0.5.2->autogluon) (2021.3)
Collecting statsmodels!=0.12.0,>=0.11
Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.8/9.8 MB 180.9 MB/s eta 0:00:00a 0:00:01
Requirement already satisfied: Cython!=0.29.18,>=0.29 in /usr/local/lib/python3.7/site-packages (from pmdarima~=1.8.2->autogluon.timeseries[all]==0.5.2->autogluon) (0.29.24)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/site-packages (from pmdarima~=1.8.2->autogluon.timeseries[all]==0.5.2->autogluon) (1.25.11)
Collecting tqdm>=4.38.0
Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 kB 186.5 MB/s eta 0:00:00
Collecting tensorboard>=2.2.0
Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.9/5.9 MB 199.0 MB/s eta 0:00:00
Collecting pyDeprecate>=0.3.1
Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting filelock
Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting virtualenv
Downloading virtualenv-20.16.5-py3-none-any.whl (8.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.8/8.8 MB 182.7 MB/s eta 0:00:00a 0:00:01
Requirement already satisfied: attrs in /usr/local/lib/python3.7/site-packages (from ray<1.14,>=1.13->autogluon.core[all]==0.5.2->autogluon) (21.2.0)
Collecting click>=6.6
Downloading click-8.0.4-py3-none-any.whl (97 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.5/97.5 kB 190.4 MB/s eta 0:00:00
Collecting aiosignal
Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting frozenlist
Downloading frozenlist-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (148 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 148.0/148.0 kB 177.2 MB/s eta 0:00:00
Collecting grpcio<=1.43.0,>=1.28.1
Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.1/4.1 MB 210.7 MB/s eta 0:00:00
Collecting jsonschema
Downloading jsonschema-4.16.0-py3-none-any.whl (83 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 83.1/83.1 kB 175.7 MB/s eta 0:00:00
Collecting tensorboardX>=1.9
Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.4/125.4 kB 198.8 MB/s eta 0:00:00
Requirement already satisfied: tabulate in /usr/local/lib/python3.7/site-packages (from ray<1.14,>=1.13->autogluon.core[all]==0.5.2->autogluon) (0.8.9)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/site-packages (from requests->autogluon.core[all]==0.5.2->autogluon) (2021.10.8)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests->autogluon.core[all]==0.5.2->autogluon) (2.8)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/site-packages (from requests->autogluon.core[all]==0.5.2->autogluon) (3.0.4)
Collecting PyWavelets>=1.1.1
Downloading PyWavelets-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 161.6 MB/s eta 0:00:00
Collecting tifffile>=2019.7.26
Downloading tifffile-2021.11.2-py3-none-any.whl (178 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 178.9/178.9 kB 196.4 MB/s eta 0:00:00
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.7/site-packages (from scikit-image<0.20.0,>=0.19.1->autogluon.multimodal==0.5.2->autogluon) (2.13.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/site-packages (from scikit-learn<1.1,>=1.0.0->autogluon.core[all]==0.5.2->autogluon) (3.0.0)
Collecting deprecated>=1.2.13
Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Requirement already satisfied: numba>=0.53 in /usr/local/lib/python3.7/site-packages (from sktime~=0.11.4->autogluon.timeseries[all]==0.5.2->autogluon) (0.53.1)
Collecting huggingface-hub<1.0,>=0.1.0
Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.5/163.5 kB 129.6 MB/s eta 0:00:00
Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/site-packages (from transformers<4.21.0,>=4.18.0->autogluon.multimodal==0.5.2->autogluon) (4.8.2)
Collecting tokenizers>=0.9.4
Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 209.7 MB/s eta 0:00:00
Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /usr/local/lib/python3.7/site-packages (from boto3->autogluon.core[all]==0.5.2->autogluon) (0.5.0)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/site-packages (from boto3->autogluon.core[all]==0.5.2->autogluon) (0.10.0)
Requirement already satisfied: botocore<1.24.0,>=1.23.17 in /usr/local/lib/python3.7/site-packages (from boto3->autogluon.core[all]==0.5.2->autogluon) (1.23.17)
Collecting wrapt<2,>=1.10
Downloading wrapt-1.14.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (75 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.2/75.2 kB 180.6 MB/s eta 0:00:00
Collecting aiohttp
Downloading aiohttp-3.8.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (948 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 948.0/948.0 kB 235.1 MB/s eta 0:00:00
Collecting hijri-converter
Downloading hijri_converter-2.2.4-py3-none-any.whl (14 kB)
Collecting convertdate>=2.3.0
Downloading convertdate-2.4.0-py3-none-any.whl (47 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 47.9/47.9 kB 146.7 MB/s eta 0:00:00
Collecting korean-lunar-calendar
Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl (9.0 kB)
Requirement already satisfied: llvmlite<0.37,>=0.36.0rc1 in /usr/local/lib/python3.7/site-packages (from numba>=0.53->sktime~=0.11.4->autogluon.timeseries[all]==0.5.2->autogluon) (0.36.0)
Collecting locket
Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Collecting typing-extensions~=4.0
Downloading typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Requirement already satisfied: tomli>=1.0.0 in /usr/local/lib/python3.7/site-packages (from setuptools-scm>=4->matplotlib->autogluon.core[all]==0.5.2->autogluon) (1.2.2)
Collecting typer<0.5.0,>=0.3.0
Downloading typer-0.4.2-py3-none-any.whl (27 kB)
Collecting catalogue<2.1.0,>=2.0.6
Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.10
Downloading spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Collecting srsly<3.0.0,>=2.4.3
Downloading srsly-2.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (490 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 490.0/490.0 kB 241.8 MB/s eta 0:00:00
Collecting preshed<3.1.0,>=3.0.2
Downloading preshed-3.0.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (126 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.6/126.6 kB 198.2 MB/s eta 0:00:00
Collecting cymem<2.1.0,>=2.0.2
Downloading cymem-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36 kB)
Collecting murmurhash<1.1.0,>=0.28.0
Downloading murmurhash-1.0.9-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting wasabi<1.1.0,>=0.9.1
Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting langcodes<4.0.0,>=3.2.0
Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 181.6/181.6 kB 220.2 MB/s eta 0:00:00
Collecting typing-extensions~=4.0
Downloading typing_extensions-4.1.1-py3-none-any.whl (26 kB)
Collecting thinc<8.2.0,>=8.1.0
Downloading thinc-8.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (806 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806.2/806.2 kB 235.7 MB/s eta 0:00:00
Collecting pathy>=0.3.5
Downloading pathy-0.6.2-py3-none-any.whl (42 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 kB 84.3 MB/s eta 0:00:00
Collecting spacy-loggers<2.0.0,>=1.0.0
Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting patsy>=0.5.2
Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.8/233.8 kB 99.4 MB/s eta 0:00:00
Collecting google-auth-oauthlib<0.5,>=0.4.1
Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0
Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.9/4.9 MB 199.3 MB/s eta 0:00:00
Collecting absl-py>=0.4
Downloading absl_py-1.3.0-py3-none-any.whl (124 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.6/124.6 kB 125.5 MB/s eta 0:00:00
Collecting tensorboard-plugin-wit>=1.6.0
Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl (781 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 781.3/781.3 kB 247.2 MB/s eta 0:00:00
Collecting google-auth<3,>=1.6.3
Downloading google_auth-2.13.0-py2.py3-none-any.whl (174 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.5/174.5 kB 200.5 MB/s eta 0:00:00
Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.7/site-packages (from tensorboard>=2.2.0->pytorch-lightning<1.7.0,>=1.6.0->autogluon.multimodal==0.5.2->autogluon) (2.0.2)
Collecting markdown>=2.6.8
Downloading Markdown-3.4.1-py3-none-any.whl (93 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 93.3/93.3 kB 190.2 MB/s eta 0:00:00
Collecting heapdict
Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)
Collecting immutables>=0.9
Downloading immutables-0.19-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 117.0/117.0 kB 191.6 MB/s eta 0:00:00
Collecting mccabe<0.8.0,>=0.7.0
Downloading mccabe-0.7.0-py2.py3-none-any.whl (7.3 kB)
Collecting pyflakes<2.6.0,>=2.5.0
Downloading pyflakes-2.5.0-py2.py3-none-any.whl (66 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.1/66.1 kB 176.4 MB/s eta 0:00:00
Collecting importlib-metadata
Downloading importlib_metadata-4.2.0-py3-none-any.whl (16 kB)
Collecting pycodestyle<2.10.0,>=2.9.0
Downloading pycodestyle-2.9.1-py2.py3-none-any.whl (41 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.5/41.5 kB 142.3 MB/s eta 0:00:00
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/site-packages (from importlib-metadata->transformers<4.21.0,>=4.18.0->autogluon.multimodal==0.5.2->autogluon) (3.6.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.7/site-packages (from jinja2->distributed<=2021.11.2,>=2021.09.1->autogluon.core[all]==0.5.2->autogluon) (2.0.1)
Collecting pkgutil-resolve-name>=1.3.10
Downloading pkgutil_resolve_name-1.3.10-py3-none-any.whl (4.7 kB)
Collecting pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0
Downloading pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 117.1/117.1 kB 201.4 MB/s eta 0:00:00
Collecting importlib-resources>=1.4.0
Downloading importlib_resources-5.10.0-py3-none-any.whl (34 kB)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/site-packages (from plotly->catboost<1.1,>=1.0->autogluon.tabular[all]==0.5.2->autogluon) (8.0.1)
Requirement already satisfied: colorama in /usr/local/lib/python3.7/site-packages (from sacrebleu->autogluon-contrib-nlp==0.0.1b20220208->autogluon.text==0.5.2->autogluon) (0.4.3)
Collecting lxml
Downloading lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 191.2 MB/s eta 0:00:00
Collecting distlib<1,>=0.3.5
Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 468.5/468.5 kB 226.6 MB/s eta 0:00:00
Collecting virtualenv
Downloading virtualenv-20.16.4-py3-none-any.whl (8.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.8/8.8 MB 180.4 MB/s eta 0:00:00a 0:00:01
Downloading virtualenv-20.16.3-py2.py3-none-any.whl (8.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.8/8.8 MB 168.1 MB/s eta 0:00:00a 0:00:01
Downloading virtualenv-20.16.2-py2.py3-none-any.whl (8.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.8/8.8 MB 177.7 MB/s eta 0:00:00a 0:00:01
Collecting platformdirs<3,>=2
Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Collecting pymeeus<=1,>=0.3.13
Downloading PyMeeus-0.5.11.tar.gz (5.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.4/5.4 MB 199.7 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Collecting cachetools<6.0,>=2.0.0
Downloading cachetools-5.2.0-py3-none-any.whl (9.3 kB)
Collecting pyasn1-modules>=0.2.1
Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 155.3/155.3 kB 215.0 MB/s eta 0:00:00
Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard>=2.2.0->pytorch-lightning<1.7.0,>=1.6.0->autogluon.multimodal==0.5.2->autogluon) (4.7.2)
Collecting requests-oauthlib>=0.7.0
Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting markdown>=2.6.8
Downloading Markdown-3.4-py3-none-any.whl (93 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 93.3/93.3 kB 165.3 MB/s eta 0:00:00
Downloading Markdown-3.3.7-py3-none-any.whl (97 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.8/97.8 kB 197.0 MB/s eta 0:00:00
Downloading Markdown-3.3.6-py3-none-any.whl (97 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.8/97.8 kB 179.7 MB/s eta 0:00:00
Downloading Markdown-3.3.4-py3-none-any.whl (97 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 97.6/97.6 kB 199.0 MB/s eta 0:00:00
Collecting confection<1.0.0,>=0.0.1
Downloading confection-0.0.3-py3-none-any.whl (32 kB)
Collecting blis<0.8.0,>=0.7.8
Downloading blis-0.7.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.2/10.2 MB 169.5 MB/s eta 0:00:00a 0:00:01
Collecting yarl<2.0,>=1.0
Downloading yarl-1.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (231 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 231.3/231.3 kB 209.6 MB/s eta 0:00:00
Collecting charset-normalizer<3.0,>=2.0
Downloading charset_normalizer-2.1.1-py3-none-any.whl (39 kB)
Collecting async-timeout<5.0,>=4.0.0a3
Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting asynctest==0.13.0
Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting multidict<7.0,>=4.5
Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 94.8/94.8 kB 193.8 MB/s eta 0:00:00
Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard>=2.2.0->pytorch-lightning<1.7.0,>=1.6.0->autogluon.multimodal==0.5.2->autogluon) (0.4.8)
Collecting oauthlib>=3.0.0
Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 151.7/151.7 kB 197.6 MB/s eta 0:00:00
Building wheels for collected packages: fairscale, antlr4-python3-runtime, sacremoses, contextvars, future, pymeeus
Building wheel for fairscale (pyproject.toml) ... done
Created wheel for fairscale: filename=fairscale-0.4.6-py3-none-any.whl size=307225 sha256=408abe181e5c8c6455d2988de3249af08ce115c887cb598b06899c544eeee679
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/0b/8c/fa/a9e102632bcb86e919561cf25ca1e0dd2ec67476f3a5544653
Building wheel for antlr4-python3-runtime (setup.py) ... done
Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141210 sha256=47cf9d6e57166ca87a031f74fd2d2a4996bc987dad6a3215f3feb78c5dd9cb7a
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/c9/ef/75/1b8c6588a8a8a15d5a9136608a9d65172a226577e7ae89da31
Building wheel for sacremoses (setup.py) ... done
Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895241 sha256=d9f98d6f3f0cc0f4475837e74cb857f4584b2bdc668878b18aa72d206cdd4d19
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/5b/e0/77/05245143a5b31f65af6a21f7afd3219e9fa4896f918af45677
Building wheel for contextvars (setup.py) ... done
Created wheel for contextvars: filename=contextvars-2.4-py3-none-any.whl size=7664 sha256=e996afc200da92862c5d649bbc379bd2c23a7c0b7c9ebee59ad7189424f4d4c4
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/1b/4f/f6/2cf0b56beceeb4a516c29f1a061522603b2db256b1c9930fee
Building wheel for future (setup.py) ... done
Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491058 sha256=7cbb41c06213a4c7030f52d619f7e708e94a62fca810bf390368bb36cb30e4ed
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/3e/3c/b4/7132d27620dd551cf00823f798a7190e7320ae7ffb71d1e989
Building wheel for pymeeus (setup.py) ... done
Created wheel for pymeeus: filename=PyMeeus-0.5.11-py3-none-any.whl size=730971 sha256=6a0982bec2eab55e01b18da96f42b6bc944c844116818d5a636cd21b8ba2eefa
Stored in directory: /tmp/pip-ephem-wheel-cache-7x0bj8hf/wheels/bc/17/d4/0095e29d942940d5653b55f8503c4940e1fad226352c98c0d8
Successfully built fairscale antlr4-python3-runtime sacremoses contextvars future pymeeus
Installing collected packages: wasabi, typish, tokenizers, tensorboard-plugin-wit, sortedcontainers, sentencepiece, pymeeus, py4j, msgpack, korean-lunar-calendar, heapdict, distlib, cymem, antlr4-python3-runtime, zict, yacs, wrapt, typing-extensions, tqdm, toolz, tensorboard-data-server, tblib, spacy-loggers, spacy-legacy, smart-open, regex, pyrsistent, pyflakes, pyDeprecate, pycodestyle, pyasn1-modules, protobuf, platformdirs, pkgutil-resolve-name, Pillow, omegaconf, oauthlib, numpy, murmurhash, multidict, mccabe, lxml, locket, langcodes, importlib-resources, hijri-converter, grpcio, future, frozenlist, filelock, fastprogress, convertdate, charset-normalizer, cachetools, autocfg, asynctest, absl-py, yarl, torch, tifffile, tensorboardX, scipy, sacrebleu, requests-oauthlib, PyWavelets, pydantic, preshed, patsy, partd, nptyping, importlib-metadata, immutables, holidays, google-auth, fastcore, deprecated, catalogue, blis, async-timeout, aiosignal, xgboost, virtualenv, torchvision, torchtext, torchmetrics, statsmodels, srsly, scikit-image, nlpaug, markdown, jsonschema, hyperopt, huggingface-hub, google-auth-oauthlib, flake8, fastdownload, fairscale, dask, contextvars, click, aiohttp, typer, transformers, timm, tensorboard, sktime, sacremoses, ray, pytorch-metric-learning, pmdarima, nltk, lightgbm, gluonts, gluoncv, distributed, confection, catboost, thinc, tbats, pytorch-lightning, pathy, autogluon-contrib-nlp, autogluon.common, spacy, autogluon.features, autogluon.core, fastai, autogluon.vision, autogluon.timeseries, autogluon.tabular, autogluon.multimodal, autogluon.text, autogluon
Attempting uninstall: typing-extensions
Found existing installation: typing_extensions 4.0.1
Uninstalling typing_extensions-4.0.1:
Successfully uninstalled typing_extensions-4.0.1
Attempting uninstall: tqdm
Found existing installation: tqdm 4.39.0
Uninstalling tqdm-4.39.0:
Successfully uninstalled tqdm-4.39.0
Attempting uninstall: protobuf
Found existing installation: protobuf 3.19.1
Uninstalling protobuf-3.19.1:
Successfully uninstalled protobuf-3.19.1
Attempting uninstall: Pillow
Found existing installation: Pillow 8.4.0
Uninstalling Pillow-8.4.0:
Successfully uninstalled Pillow-8.4.0
Attempting uninstall: numpy
Found existing installation: numpy 1.19.1
Uninstalling numpy-1.19.1:
Successfully uninstalled numpy-1.19.1
Attempting uninstall: scipy
Found existing installation: scipy 1.4.1
Uninstalling scipy-1.4.1:
Successfully uninstalled scipy-1.4.1
Attempting uninstall: importlib-metadata
Found existing installation: importlib-metadata 4.8.2
Uninstalling importlib-metadata-4.8.2:
Successfully uninstalled importlib-metadata-4.8.2
Attempting uninstall: gluoncv
Found existing installation: gluoncv 0.8.0
Uninstalling gluoncv-0.8.0:
Successfully uninstalled gluoncv-0.8.0
Successfully installed Pillow-9.0.1 PyWavelets-1.3.0 absl-py-1.3.0 aiohttp-3.8.3 aiosignal-1.2.0 antlr4-python3-runtime-4.8 async-timeout-4.0.2 asynctest-0.13.0 autocfg-0.0.8 autogluon-0.5.2 autogluon-contrib-nlp-0.0.1b20220208 autogluon.common-0.5.2 autogluon.core-0.5.2 autogluon.features-0.5.2 autogluon.multimodal-0.5.2 autogluon.tabular-0.5.2 autogluon.text-0.5.2 autogluon.timeseries-0.5.2 autogluon.vision-0.5.2 blis-0.7.9 cachetools-5.2.0 catalogue-2.0.8 catboost-1.0.6 charset-normalizer-2.1.1 click-8.0.4 confection-0.0.3 contextvars-2.4 convertdate-2.4.0 cymem-2.0.7 dask-2021.11.2 deprecated-1.2.13 distlib-0.3.6 distributed-2021.11.2 fairscale-0.4.6 fastai-2.7.9 fastcore-1.5.27 fastdownload-0.0.7 fastprogress-1.0.3 filelock-3.8.0 flake8-5.0.4 frozenlist-1.3.1 future-0.18.2 gluoncv-0.10.5.post0 gluonts-0.9.9 google-auth-2.13.0 google-auth-oauthlib-0.4.6 grpcio-1.43.0 heapdict-1.0.1 hijri-converter-2.2.4 holidays-0.16 huggingface-hub-0.10.1 hyperopt-0.2.7 immutables-0.19 importlib-metadata-4.2.0 importlib-resources-5.10.0 jsonschema-4.16.0 korean-lunar-calendar-0.3.1 langcodes-3.3.0 lightgbm-3.3.3 locket-1.0.0 lxml-4.9.1 markdown-3.3.4 mccabe-0.7.0 msgpack-1.0.4 multidict-6.0.2 murmurhash-1.0.9 nlpaug-1.1.10 nltk-3.7 nptyping-1.4.4 numpy-1.21.6 oauthlib-3.2.2 omegaconf-2.1.2 partd-1.3.0 pathy-0.6.2 patsy-0.5.3 pkgutil-resolve-name-1.3.10 platformdirs-2.5.2 pmdarima-1.8.5 preshed-3.0.8 protobuf-3.18.1 py4j-0.10.9.7 pyDeprecate-0.3.2 pyasn1-modules-0.2.8 pycodestyle-2.9.1 pydantic-1.10.2 pyflakes-2.5.0 pymeeus-0.5.11 pyrsistent-0.18.1 pytorch-lightning-1.6.5 pytorch-metric-learning-1.3.2 ray-1.13.0 regex-2022.9.13 requests-oauthlib-1.3.1 sacrebleu-2.3.1 sacremoses-0.0.53 scikit-image-0.19.3 scipy-1.7.3 sentencepiece-0.1.95 sktime-0.11.4 smart-open-5.2.1 sortedcontainers-2.4.0 spacy-3.4.2 spacy-legacy-3.0.10 spacy-loggers-1.0.3 srsly-2.4.5 statsmodels-0.13.2 tbats-1.1.1 tblib-1.7.0 tensorboard-2.10.1 tensorboard-data-server-0.6.1 tensorboard-plugin-wit-1.8.1 tensorboardX-2.5.1 thinc-8.1.5 tifffile-2021.11.2 timm-0.5.4 tokenizers-0.12.1 toolz-0.12.0 torch-1.12.1 torchmetrics-0.7.3 torchtext-0.13.1 torchvision-0.13.1 tqdm-4.64.1 transformers-4.20.1 typer-0.4.2 typing-extensions-4.1.1 typish-1.9.3 virtualenv-20.16.2 wasabi-0.10.1 wrapt-1.14.1 xgboost-1.4.2 yacs-0.1.8 yarl-1.8.1 zict-2.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
# create the .kaggle directory and an empty kaggle.json file
# !mkdir -p /root/.kaggle
# !touch /root/.kaggle/kaggle.json
# !chmod 600 /root/.kaggle/kaggle.json
# !more /root/.kaggle/kaggle.json
# Fill in your user name and key from creating the kaggle account and API token file
# import json
# kaggle_username = "jeff93"
# kaggle_key = "30761de14c08905e668a106774fcbdff"
# # Save API token the kaggle.json file
# with open("/root/.kaggle/kaggle.json", "w") as f:
# f.write(json.dumps({"username": kaggle_username, "key": kaggle_key}))
!python3 -m pip install kaggle
# Download the dataset, it will be in a .zip file so you'll need to unzip it as well.
# !kaggle competitions download -c bike-sharing-demand
# If you already downloaded it you can use the -o command to overwrite the file
# !unzip -o bike-sharing-demand.zip
Collecting kaggle Using cached kaggle-1.5.12-py3-none-any.whl Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from kaggle) (2.8.2) Requirement already satisfied: tqdm in /usr/local/lib/python3.7/site-packages (from kaggle) (4.64.1) Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/site-packages (from kaggle) (1.25.11) Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/site-packages (from kaggle) (1.16.0) Requirement already satisfied: certifi in /usr/local/lib/python3.7/site-packages (from kaggle) (2021.10.8) Requirement already satisfied: requests in /usr/local/lib/python3.7/site-packages (from kaggle) (2.22.0) Collecting python-slugify Using cached python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB) Collecting text-unidecode>=1.3 Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB) Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/site-packages (from requests->kaggle) (3.0.4) Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests->kaggle) (2.8) Installing collected packages: text-unidecode, python-slugify, kaggle Successfully installed kaggle-1.5.12 python-slugify-6.1.2 text-unidecode-1.3 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import graphviz
from sklearn.tree import plot_tree
from datetime import datetime
import autogluon
from autogluon.tabular import TabularPredictor
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
# Create the train dataset in pandas by reading the csv
# Set the parsing of the datetime column so you can use some of the `dt` features in pandas later
train = pd.read_csv("./train.csv")
train.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
# Simple output of the train dataset to view some of the min/max/varition of the dataset features.
train.describe()
| season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.00000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
| mean | 2.506614 | 0.028569 | 0.680875 | 1.418427 | 20.23086 | 23.655084 | 61.886460 | 12.799395 | 36.021955 | 155.552177 | 191.574132 |
| std | 1.116174 | 0.166599 | 0.466159 | 0.633839 | 7.79159 | 8.474601 | 19.245033 | 8.164537 | 49.960477 | 151.039033 | 181.144454 |
| min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.82000 | 0.760000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 2.000000 | 0.000000 | 0.000000 | 1.000000 | 13.94000 | 16.665000 | 47.000000 | 7.001500 | 4.000000 | 36.000000 | 42.000000 |
| 50% | 3.000000 | 0.000000 | 1.000000 | 1.000000 | 20.50000 | 24.240000 | 62.000000 | 12.998000 | 17.000000 | 118.000000 | 145.000000 |
| 75% | 4.000000 | 0.000000 | 1.000000 | 2.000000 | 26.24000 | 31.060000 | 77.000000 | 16.997900 | 49.000000 | 222.000000 | 284.000000 |
| max | 4.000000 | 1.000000 | 1.000000 | 4.000000 | 41.00000 | 45.455000 | 100.000000 | 56.996900 | 367.000000 | 886.000000 | 977.000000 |
# Create the test pandas dataframe in pandas by reading the csv, remember to parse the datetime!
test = pd.read_csv("./test.csv")
test.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 |
| 1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
| 2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
| 3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
| 4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
# Same thing as train and test dataset
submission = pd.read_csv("./sampleSubmission.csv")
submission.head()
| datetime | count | |
|---|---|---|
| 0 | 2011-01-20 00:00:00 | 0 |
| 1 | 2011-01-20 01:00:00 | 0 |
| 2 | 2011-01-20 02:00:00 | 0 |
| 3 | 2011-01-20 03:00:00 | 0 |
| 4 | 2011-01-20 04:00:00 | 0 |
# checking for nulls in training data
train.isna().sum()
datetime 0 season 0 holiday 0 workingday 0 weather 0 temp 0 atemp 0 humidity 0 windspeed 0 casual 0 registered 0 count 0 dtype: int64
# dropping columns from training dataset which are not present in test dataset
train.drop(columns = ["casual", "registered"], inplace=True)
train.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | count | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 1 |
Requirements:
count, so it is the label we are setting.casual and registered columns as they are also not present in the test dataset. root_mean_squared_error as the metric to use for evaluation.best_quality to focus on creating the best model.predictor = TabularPredictor(label="count", eval_metric="root_mean_squared_error").fit(train,
time_limit=600,
presets="best_quality",
auto_stack=True,
num_bag_folds=5,
num_bag_sets=2,
num_stack_levels=3
)
No path specified. Models will be saved in: "AutogluonModels/ag-20221025_024102/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=2
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20221025_024102/"
AutoGluon Version: 0.5.2
Python Version: 3.7.10
Operating System: Linux
Train Data Rows: 10886
Train Data Columns: 9
Label Column: count
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
Label info (max, min, mean, stddev): (977, 1, 191.57413, 181.14445)
If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 3026.34 MB
Train Data (Original) Memory Usage: 1.52 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Fitting DatetimeFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 3 | ['temp', 'atemp', 'windspeed']
('int', []) : 5 | ['season', 'holiday', 'workingday', 'weather', 'humidity']
('object', ['datetime_as_object']) : 1 | ['datetime']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 3 | ['temp', 'atemp', 'windspeed']
('int', []) : 3 | ['season', 'weather', 'humidity']
('int', ['bool']) : 2 | ['holiday', 'workingday']
('int', ['datetime_as_int']) : 5 | ['datetime', 'datetime.year', 'datetime.month', 'datetime.day', 'datetime.dayofweek']
0.1s = Fit runtime
9 features in original data used to generate 13 features in processed data.
Train Data (Processed) Memory Usage: 0.98 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.18s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
To change this, specify the eval_metric parameter of Predictor()
AutoGluon will fit 4 stack levels (L1 to L4) ...
Fitting 11 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ... Training model for up to 199.89s of the 599.82s of remaining time.
-101.5462 = Validation score (-root_mean_squared_error)
0.03s = Training runtime
0.1s = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ... Training model for up to 199.5s of the 599.42s of remaining time.
-84.1251 = Validation score (-root_mean_squared_error)
0.03s = Training runtime
0.1s = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ... Training model for up to 199.1s of the 599.03s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
2022-10-25 02:41:05,006 WARNING services.py:2013 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 416284672 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=0.97gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.
-132.6718 = Validation score (-root_mean_squared_error)
31.4s = Training runtime
4.22s = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 160.49s of the 560.42s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-131.9626 = Validation score (-root_mean_squared_error)
16.6s = Training runtime
1.25s = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 140.51s of the 540.44s of remaining time.
-116.5443 = Validation score (-root_mean_squared_error)
10.78s = Training runtime
0.54s = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 126.42s of the 526.34s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-131.4292 = Validation score (-root_mean_squared_error)
77.16s = Training runtime
0.09s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ... Training model for up to 46.26s of the 446.19s of remaining time.
-124.5881 = Validation score (-root_mean_squared_error)
4.84s = Training runtime
0.52s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ... Training model for up to 38.25s of the 438.18s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-138.2875 = Validation score (-root_mean_squared_error)
45.63s = Training runtime
0.34s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 389.58s of remaining time.
-84.1251 = Validation score (-root_mean_squared_error)
0.63s = Training runtime
0.0s = Validation runtime
Fitting 9 L2 models ...
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 172.79s of the 388.86s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-61.1812 = Validation score (-root_mean_squared_error)
31.47s = Training runtime
3.08s = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 137.61s of the 353.68s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-55.1779 = Validation score (-root_mean_squared_error)
13.63s = Training runtime
0.2s = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 121.07s of the 337.13s of remaining time.
-53.5467 = Validation score (-root_mean_squared_error)
25.49s = Training runtime
0.58s = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 92.46s of the 308.53s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-56.0424 = Validation score (-root_mean_squared_error)
52.41s = Training runtime
0.07s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L2 ... Training model for up to 37.27s of the 253.34s of remaining time.
-53.783 = Validation score (-root_mean_squared_error)
7.84s = Training runtime
0.57s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 26.31s of the 242.37s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-55.0497 = Validation score (-root_mean_squared_error)
34.88s = Training runtime
0.37s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.0s of the 204.45s of remaining time.
-52.2905 = Validation score (-root_mean_squared_error)
0.39s = Training runtime
0.0s = Validation runtime
Fitting 9 L3 models ...
Fitting model: LightGBMXT_BAG_L3 ... Training model for up to 135.95s of the 203.96s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-54.1735 = Validation score (-root_mean_squared_error)
11.82s = Training runtime
0.14s = Validation runtime
Fitting model: LightGBM_BAG_L3 ... Training model for up to 121.15s of the 189.16s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-52.9014 = Validation score (-root_mean_squared_error)
11.77s = Training runtime
0.06s = Validation runtime
Fitting model: RandomForestMSE_BAG_L3 ... Training model for up to 106.39s of the 174.4s of remaining time.
-53.2436 = Validation score (-root_mean_squared_error)
24.79s = Training runtime
0.62s = Validation runtime
Fitting model: CatBoost_BAG_L3 ... Training model for up to 78.44s of the 146.45s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-52.391 = Validation score (-root_mean_squared_error)
16.7s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L3 ... Training model for up to 58.99s of the 127.0s of remaining time.
-52.713 = Validation score (-root_mean_squared_error)
7.55s = Training runtime
0.59s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L3 ... Training model for up to 48.23s of the 116.23s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-52.8623 = Validation score (-root_mean_squared_error)
50.48s = Training runtime
0.37s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L4 ... Training model for up to 360.0s of the 62.66s of remaining time.
-52.0675 = Validation score (-root_mean_squared_error)
0.4s = Training runtime
0.0s = Validation runtime
Fitting 9 L4 models ...
Fitting model: LightGBMXT_BAG_L4 ... Training model for up to 62.17s of the 62.15s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-54.0258 = Validation score (-root_mean_squared_error)
11.87s = Training runtime
0.1s = Validation runtime
Fitting model: LightGBM_BAG_L4 ... Training model for up to 46.56s of the 46.54s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-53.9538 = Validation score (-root_mean_squared_error)
11.93s = Training runtime
0.08s = Validation runtime
Fitting model: RandomForestMSE_BAG_L4 ... Training model for up to 31.68s of the 31.66s of remaining time.
-54.0206 = Validation score (-root_mean_squared_error)
25.1s = Training runtime
0.6s = Validation runtime
Fitting model: CatBoost_BAG_L4 ... Training model for up to 3.45s of the 3.43s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-58.0821 = Validation score (-root_mean_squared_error)
8.3s = Training runtime
0.04s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L5 ... Training model for up to 360.0s of the -7.82s of remaining time.
-53.204 = Validation score (-root_mean_squared_error)
0.28s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 608.32s ... Best model: "WeightedEnsemble_L4"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20221025_024102/")
predictor.fit_summary()
*** Summary of fit() ***
Estimated performance of each model:
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L4 -52.067544 13.729840 463.894532 0.001125 0.398277 4 True 23
1 WeightedEnsemble_L3 -52.290478 8.897230 268.715461 0.000737 0.387902 3 True 16
2 CatBoost_BAG_L3 -52.390974 12.090207 368.909167 0.041911 16.704700 3 True 20
3 ExtraTreesMSE_BAG_L3 -52.713042 12.641901 359.758229 0.593604 7.553763 3 True 21
4 NeuralNetFastAI_BAG_L3 -52.862330 12.414234 402.681698 0.365938 50.477232 3 True 22
5 LightGBM_BAG_L3 -52.901360 12.112232 363.971588 0.063936 11.767122 3 True 18
6 WeightedEnsemble_L5 -53.204038 14.687260 532.800778 0.000784 0.279360 5 True 28
7 RandomForestMSE_BAG_L3 -53.243589 12.663326 376.993439 0.615030 24.788972 3 True 19
8 RandomForestMSE_BAG_L2 -53.546740 7.754898 211.969776 0.582245 25.488053 2 True 12
9 ExtraTreesMSE_BAG_L2 -53.783029 7.746575 194.326414 0.573921 7.844691 2 True 14
10 LightGBM_BAG_L4 -53.953795 13.944231 487.246849 0.075230 11.927903 4 True 25
11 RandomForestMSE_BAG_L4 -54.020574 14.471696 500.415167 0.602694 25.096221 4 True 26
12 LightGBMXT_BAG_L4 -54.025818 13.969836 487.193233 0.100834 11.874287 4 True 24
13 LightGBMXT_BAG_L3 -54.173493 12.188583 364.027157 0.140287 11.822690 3 True 17
14 NeuralNetFastAI_BAG_L2 -55.049654 7.542790 221.360577 0.370137 34.878853 2 True 15
15 LightGBM_BAG_L2 -55.177912 7.370190 200.115961 0.197536 13.634238 2 True 11
16 CatBoost_BAG_L2 -56.042398 7.247432 238.888609 0.074778 52.406886 2 True 13
17 CatBoost_BAG_L4 -58.082147 13.907718 483.623006 0.038717 8.304060 4 True 27
18 LightGBMXT_BAG_L2 -61.181244 10.249679 217.951745 3.077025 31.470022 2 True 10
19 KNeighborsDist_BAG_L1 -84.125061 0.103631 0.031427 0.103631 0.031427 1 True 2
20 WeightedEnsemble_L2 -84.125061 0.104777 0.660337 0.001146 0.628910 2 True 9
21 KNeighborsUnif_BAG_L1 -101.546199 0.102882 0.032213 0.102882 0.032213 1 True 1
22 RandomForestMSE_BAG_L1 -116.544294 0.535544 10.780408 0.535544 10.780408 1 True 5
23 ExtraTreesMSE_BAG_L1 -124.588053 0.523520 4.838314 0.523520 4.838314 1 True 7
24 CatBoost_BAG_L1 -131.429158 0.087753 77.162732 0.087753 77.162732 1 True 6
25 LightGBM_BAG_L1 -131.962589 1.254450 16.601223 1.254450 16.601223 1 True 4
26 LightGBMXT_BAG_L1 -132.671834 4.220521 31.404651 4.220521 31.404651 1 True 3
27 NeuralNetFastAI_BAG_L1 -138.287536 0.344353 45.630755 0.344353 45.630755 1 True 8
Number of models trained: 28
Types of models trained:
{'StackerEnsembleModel_LGB', 'StackerEnsembleModel_NNFastAiTabular', 'StackerEnsembleModel_KNN', 'StackerEnsembleModel_XT', 'StackerEnsembleModel_RF', 'StackerEnsembleModel_CatBoost', 'WeightedEnsembleModel'}
Bagging used: True (with 5 folds)
Multi-layer stack-ensembling used: True (with 5 levels)
Feature Metadata (Processed):
(raw dtype, special dtypes):
('float', []) : 3 | ['temp', 'atemp', 'windspeed']
('int', []) : 3 | ['season', 'weather', 'humidity']
('int', ['bool']) : 2 | ['holiday', 'workingday']
('int', ['datetime_as_int']) : 5 | ['datetime', 'datetime.year', 'datetime.month', 'datetime.day', 'datetime.dayofweek']
Plot summary of models saved to file: AutogluonModels/ag-20221025_024102/SummaryOfModels.html
*** End of fit() summary ***
{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
'WeightedEnsemble_L2': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFastAiTabular',
'WeightedEnsemble_L3': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L3': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L3': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L3': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L3': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L3': 'StackerEnsembleModel_NNFastAiTabular',
'WeightedEnsemble_L4': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L4': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L4': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L4': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L4': 'StackerEnsembleModel_CatBoost',
'WeightedEnsemble_L5': 'WeightedEnsembleModel'},
'model_performance': {'KNeighborsUnif_BAG_L1': -101.54619908446061,
'KNeighborsDist_BAG_L1': -84.12506123181602,
'LightGBMXT_BAG_L1': -132.67183388581174,
'LightGBM_BAG_L1': -131.96258851906953,
'RandomForestMSE_BAG_L1': -116.54429428704391,
'CatBoost_BAG_L1': -131.42915810353512,
'ExtraTreesMSE_BAG_L1': -124.58805258915959,
'NeuralNetFastAI_BAG_L1': -138.287536370842,
'WeightedEnsemble_L2': -84.12506123181602,
'LightGBMXT_BAG_L2': -61.18124365520546,
'LightGBM_BAG_L2': -55.17791215059886,
'RandomForestMSE_BAG_L2': -53.54673971898559,
'CatBoost_BAG_L2': -56.04239772048334,
'ExtraTreesMSE_BAG_L2': -53.78302884857821,
'NeuralNetFastAI_BAG_L2': -55.04965378845021,
'WeightedEnsemble_L3': -52.29047813968845,
'LightGBMXT_BAG_L3': -54.173493083851106,
'LightGBM_BAG_L3': -52.90136016546189,
'RandomForestMSE_BAG_L3': -53.243588808979055,
'CatBoost_BAG_L3': -52.390973809644535,
'ExtraTreesMSE_BAG_L3': -52.71304150024172,
'NeuralNetFastAI_BAG_L3': -52.86232975855977,
'WeightedEnsemble_L4': -52.06754401638973,
'LightGBMXT_BAG_L4': -54.02581778766236,
'LightGBM_BAG_L4': -53.95379459785943,
'RandomForestMSE_BAG_L4': -54.02057442112101,
'CatBoost_BAG_L4': -58.082146793330104,
'WeightedEnsemble_L5': -53.204037907620396},
'model_best': 'WeightedEnsemble_L4',
'model_paths': {'KNeighborsUnif_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/KNeighborsUnif_BAG_L1/',
'KNeighborsDist_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/KNeighborsDist_BAG_L1/',
'LightGBMXT_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/LightGBMXT_BAG_L1/',
'LightGBM_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/LightGBM_BAG_L1/',
'RandomForestMSE_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/RandomForestMSE_BAG_L1/',
'CatBoost_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/CatBoost_BAG_L1/',
'ExtraTreesMSE_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/ExtraTreesMSE_BAG_L1/',
'NeuralNetFastAI_BAG_L1': 'AutogluonModels/ag-20221025_024102/models/NeuralNetFastAI_BAG_L1/',
'WeightedEnsemble_L2': 'AutogluonModels/ag-20221025_024102/models/WeightedEnsemble_L2/',
'LightGBMXT_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/LightGBMXT_BAG_L2/',
'LightGBM_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/LightGBM_BAG_L2/',
'RandomForestMSE_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/RandomForestMSE_BAG_L2/',
'CatBoost_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/CatBoost_BAG_L2/',
'ExtraTreesMSE_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/ExtraTreesMSE_BAG_L2/',
'NeuralNetFastAI_BAG_L2': 'AutogluonModels/ag-20221025_024102/models/NeuralNetFastAI_BAG_L2/',
'WeightedEnsemble_L3': 'AutogluonModels/ag-20221025_024102/models/WeightedEnsemble_L3/',
'LightGBMXT_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/LightGBMXT_BAG_L3/',
'LightGBM_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/LightGBM_BAG_L3/',
'RandomForestMSE_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/RandomForestMSE_BAG_L3/',
'CatBoost_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/CatBoost_BAG_L3/',
'ExtraTreesMSE_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/ExtraTreesMSE_BAG_L3/',
'NeuralNetFastAI_BAG_L3': 'AutogluonModels/ag-20221025_024102/models/NeuralNetFastAI_BAG_L3/',
'WeightedEnsemble_L4': 'AutogluonModels/ag-20221025_024102/models/WeightedEnsemble_L4/',
'LightGBMXT_BAG_L4': 'AutogluonModels/ag-20221025_024102/models/LightGBMXT_BAG_L4/',
'LightGBM_BAG_L4': 'AutogluonModels/ag-20221025_024102/models/LightGBM_BAG_L4/',
'RandomForestMSE_BAG_L4': 'AutogluonModels/ag-20221025_024102/models/RandomForestMSE_BAG_L4/',
'CatBoost_BAG_L4': 'AutogluonModels/ag-20221025_024102/models/CatBoost_BAG_L4/',
'WeightedEnsemble_L5': 'AutogluonModels/ag-20221025_024102/models/WeightedEnsemble_L5/'},
'model_fit_times': {'KNeighborsUnif_BAG_L1': 0.032213449478149414,
'KNeighborsDist_BAG_L1': 0.03142690658569336,
'LightGBMXT_BAG_L1': 31.404650688171387,
'LightGBM_BAG_L1': 16.60122323036194,
'RandomForestMSE_BAG_L1': 10.780408143997192,
'CatBoost_BAG_L1': 77.16273212432861,
'ExtraTreesMSE_BAG_L1': 4.838313579559326,
'NeuralNetFastAI_BAG_L1': 45.63075518608093,
'WeightedEnsemble_L2': 0.6289098262786865,
'LightGBMXT_BAG_L2': 31.470021724700928,
'LightGBM_BAG_L2': 13.634238004684448,
'RandomForestMSE_BAG_L2': 25.4880530834198,
'CatBoost_BAG_L2': 52.406885862350464,
'ExtraTreesMSE_BAG_L2': 7.844691038131714,
'NeuralNetFastAI_BAG_L2': 34.87885332107544,
'WeightedEnsemble_L3': 0.38790202140808105,
'LightGBMXT_BAG_L3': 11.822690486907959,
'LightGBM_BAG_L3': 11.7671217918396,
'RandomForestMSE_BAG_L3': 24.7889723777771,
'CatBoost_BAG_L3': 16.704700231552124,
'ExtraTreesMSE_BAG_L3': 7.553762912750244,
'NeuralNetFastAI_BAG_L3': 50.47723174095154,
'WeightedEnsemble_L4': 0.39827680587768555,
'LightGBMXT_BAG_L4': 11.874287128448486,
'LightGBM_BAG_L4': 11.927903413772583,
'RandomForestMSE_BAG_L4': 25.09622097015381,
'CatBoost_BAG_L4': 8.304060459136963,
'WeightedEnsemble_L5': 0.2793598175048828},
'model_pred_times': {'KNeighborsUnif_BAG_L1': 0.10288166999816895,
'KNeighborsDist_BAG_L1': 0.10363101959228516,
'LightGBMXT_BAG_L1': 4.220521450042725,
'LightGBM_BAG_L1': 1.2544498443603516,
'RandomForestMSE_BAG_L1': 0.5355441570281982,
'CatBoost_BAG_L1': 0.0877525806427002,
'ExtraTreesMSE_BAG_L1': 0.52351975440979,
'NeuralNetFastAI_BAG_L1': 0.34435296058654785,
'WeightedEnsemble_L2': 0.0011458396911621094,
'LightGBMXT_BAG_L2': 3.0770251750946045,
'LightGBM_BAG_L2': 0.19753646850585938,
'RandomForestMSE_BAG_L2': 0.582244873046875,
'CatBoost_BAG_L2': 0.07477831840515137,
'ExtraTreesMSE_BAG_L2': 0.5739212036132812,
'NeuralNetFastAI_BAG_L2': 0.37013673782348633,
'WeightedEnsemble_L3': 0.0007369518280029297,
'LightGBMXT_BAG_L3': 0.14028668403625488,
'LightGBM_BAG_L3': 0.06393551826477051,
'RandomForestMSE_BAG_L3': 0.61503005027771,
'CatBoost_BAG_L3': 0.041910648345947266,
'ExtraTreesMSE_BAG_L3': 0.593604326248169,
'NeuralNetFastAI_BAG_L3': 0.3659381866455078,
'WeightedEnsemble_L4': 0.001125335693359375,
'LightGBMXT_BAG_L4': 0.10083413124084473,
'LightGBM_BAG_L4': 0.07522964477539062,
'RandomForestMSE_BAG_L4': 0.602694034576416,
'CatBoost_BAG_L4': 0.038716793060302734,
'WeightedEnsemble_L5': 0.0007836818695068359},
'num_bag_folds': 5,
'max_stack_level': 5,
'model_hyperparams': {'KNeighborsUnif_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'KNeighborsDist_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'LightGBMXT_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L2': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L3': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L4': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L5': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True}},
'leaderboard': model score_val pred_time_val fit_time \
0 WeightedEnsemble_L4 -52.067544 13.729840 463.894532
1 WeightedEnsemble_L3 -52.290478 8.897230 268.715461
2 CatBoost_BAG_L3 -52.390974 12.090207 368.909167
3 ExtraTreesMSE_BAG_L3 -52.713042 12.641901 359.758229
4 NeuralNetFastAI_BAG_L3 -52.862330 12.414234 402.681698
5 LightGBM_BAG_L3 -52.901360 12.112232 363.971588
6 WeightedEnsemble_L5 -53.204038 14.687260 532.800778
7 RandomForestMSE_BAG_L3 -53.243589 12.663326 376.993439
8 RandomForestMSE_BAG_L2 -53.546740 7.754898 211.969776
9 ExtraTreesMSE_BAG_L2 -53.783029 7.746575 194.326414
10 LightGBM_BAG_L4 -53.953795 13.944231 487.246849
11 RandomForestMSE_BAG_L4 -54.020574 14.471696 500.415167
12 LightGBMXT_BAG_L4 -54.025818 13.969836 487.193233
13 LightGBMXT_BAG_L3 -54.173493 12.188583 364.027157
14 NeuralNetFastAI_BAG_L2 -55.049654 7.542790 221.360577
15 LightGBM_BAG_L2 -55.177912 7.370190 200.115961
16 CatBoost_BAG_L2 -56.042398 7.247432 238.888609
17 CatBoost_BAG_L4 -58.082147 13.907718 483.623006
18 LightGBMXT_BAG_L2 -61.181244 10.249679 217.951745
19 KNeighborsDist_BAG_L1 -84.125061 0.103631 0.031427
20 WeightedEnsemble_L2 -84.125061 0.104777 0.660337
21 KNeighborsUnif_BAG_L1 -101.546199 0.102882 0.032213
22 RandomForestMSE_BAG_L1 -116.544294 0.535544 10.780408
23 ExtraTreesMSE_BAG_L1 -124.588053 0.523520 4.838314
24 CatBoost_BAG_L1 -131.429158 0.087753 77.162732
25 LightGBM_BAG_L1 -131.962589 1.254450 16.601223
26 LightGBMXT_BAG_L1 -132.671834 4.220521 31.404651
27 NeuralNetFastAI_BAG_L1 -138.287536 0.344353 45.630755
pred_time_val_marginal fit_time_marginal stack_level can_infer \
0 0.001125 0.398277 4 True
1 0.000737 0.387902 3 True
2 0.041911 16.704700 3 True
3 0.593604 7.553763 3 True
4 0.365938 50.477232 3 True
5 0.063936 11.767122 3 True
6 0.000784 0.279360 5 True
7 0.615030 24.788972 3 True
8 0.582245 25.488053 2 True
9 0.573921 7.844691 2 True
10 0.075230 11.927903 4 True
11 0.602694 25.096221 4 True
12 0.100834 11.874287 4 True
13 0.140287 11.822690 3 True
14 0.370137 34.878853 2 True
15 0.197536 13.634238 2 True
16 0.074778 52.406886 2 True
17 0.038717 8.304060 4 True
18 3.077025 31.470022 2 True
19 0.103631 0.031427 1 True
20 0.001146 0.628910 2 True
21 0.102882 0.032213 1 True
22 0.535544 10.780408 1 True
23 0.523520 4.838314 1 True
24 0.087753 77.162732 1 True
25 1.254450 16.601223 1 True
26 4.220521 31.404651 1 True
27 0.344353 45.630755 1 True
fit_order
0 23
1 16
2 20
3 21
4 22
5 18
6 28
7 19
8 12
9 14
10 25
11 26
12 24
13 17
14 15
15 11
16 13
17 27
18 10
19 2
20 9
21 1
22 5
23 7
24 6
25 4
26 3
27 8 }
predictor.leaderboard()
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order 0 WeightedEnsemble_L4 -52.067544 13.729840 463.894532 0.001125 0.398277 4 True 23 1 WeightedEnsemble_L3 -52.290478 8.897230 268.715461 0.000737 0.387902 3 True 16 2 CatBoost_BAG_L3 -52.390974 12.090207 368.909167 0.041911 16.704700 3 True 20 3 ExtraTreesMSE_BAG_L3 -52.713042 12.641901 359.758229 0.593604 7.553763 3 True 21 4 NeuralNetFastAI_BAG_L3 -52.862330 12.414234 402.681698 0.365938 50.477232 3 True 22 5 LightGBM_BAG_L3 -52.901360 12.112232 363.971588 0.063936 11.767122 3 True 18 6 WeightedEnsemble_L5 -53.204038 14.687260 532.800778 0.000784 0.279360 5 True 28 7 RandomForestMSE_BAG_L3 -53.243589 12.663326 376.993439 0.615030 24.788972 3 True 19 8 RandomForestMSE_BAG_L2 -53.546740 7.754898 211.969776 0.582245 25.488053 2 True 12 9 ExtraTreesMSE_BAG_L2 -53.783029 7.746575 194.326414 0.573921 7.844691 2 True 14 10 LightGBM_BAG_L4 -53.953795 13.944231 487.246849 0.075230 11.927903 4 True 25 11 RandomForestMSE_BAG_L4 -54.020574 14.471696 500.415167 0.602694 25.096221 4 True 26 12 LightGBMXT_BAG_L4 -54.025818 13.969836 487.193233 0.100834 11.874287 4 True 24 13 LightGBMXT_BAG_L3 -54.173493 12.188583 364.027157 0.140287 11.822690 3 True 17 14 NeuralNetFastAI_BAG_L2 -55.049654 7.542790 221.360577 0.370137 34.878853 2 True 15 15 LightGBM_BAG_L2 -55.177912 7.370190 200.115961 0.197536 13.634238 2 True 11 16 CatBoost_BAG_L2 -56.042398 7.247432 238.888609 0.074778 52.406886 2 True 13 17 CatBoost_BAG_L4 -58.082147 13.907718 483.623006 0.038717 8.304060 4 True 27 18 LightGBMXT_BAG_L2 -61.181244 10.249679 217.951745 3.077025 31.470022 2 True 10 19 KNeighborsDist_BAG_L1 -84.125061 0.103631 0.031427 0.103631 0.031427 1 True 2 20 WeightedEnsemble_L2 -84.125061 0.104777 0.660337 0.001146 0.628910 2 True 9 21 KNeighborsUnif_BAG_L1 -101.546199 0.102882 0.032213 0.102882 0.032213 1 True 1 22 RandomForestMSE_BAG_L1 -116.544294 0.535544 10.780408 0.535544 10.780408 1 True 5 23 ExtraTreesMSE_BAG_L1 -124.588053 0.523520 4.838314 0.523520 4.838314 1 True 7 24 CatBoost_BAG_L1 -131.429158 0.087753 77.162732 0.087753 77.162732 1 True 6 25 LightGBM_BAG_L1 -131.962589 1.254450 16.601223 1.254450 16.601223 1 True 4 26 LightGBMXT_BAG_L1 -132.671834 4.220521 31.404651 4.220521 31.404651 1 True 3 27 NeuralNetFastAI_BAG_L1 -138.287536 0.344353 45.630755 0.344353 45.630755 1 True 8
| model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | WeightedEnsemble_L4 | -52.067544 | 13.729840 | 463.894532 | 0.001125 | 0.398277 | 4 | True | 23 |
| 1 | WeightedEnsemble_L3 | -52.290478 | 8.897230 | 268.715461 | 0.000737 | 0.387902 | 3 | True | 16 |
| 2 | CatBoost_BAG_L3 | -52.390974 | 12.090207 | 368.909167 | 0.041911 | 16.704700 | 3 | True | 20 |
| 3 | ExtraTreesMSE_BAG_L3 | -52.713042 | 12.641901 | 359.758229 | 0.593604 | 7.553763 | 3 | True | 21 |
| 4 | NeuralNetFastAI_BAG_L3 | -52.862330 | 12.414234 | 402.681698 | 0.365938 | 50.477232 | 3 | True | 22 |
| 5 | LightGBM_BAG_L3 | -52.901360 | 12.112232 | 363.971588 | 0.063936 | 11.767122 | 3 | True | 18 |
| 6 | WeightedEnsemble_L5 | -53.204038 | 14.687260 | 532.800778 | 0.000784 | 0.279360 | 5 | True | 28 |
| 7 | RandomForestMSE_BAG_L3 | -53.243589 | 12.663326 | 376.993439 | 0.615030 | 24.788972 | 3 | True | 19 |
| 8 | RandomForestMSE_BAG_L2 | -53.546740 | 7.754898 | 211.969776 | 0.582245 | 25.488053 | 2 | True | 12 |
| 9 | ExtraTreesMSE_BAG_L2 | -53.783029 | 7.746575 | 194.326414 | 0.573921 | 7.844691 | 2 | True | 14 |
| 10 | LightGBM_BAG_L4 | -53.953795 | 13.944231 | 487.246849 | 0.075230 | 11.927903 | 4 | True | 25 |
| 11 | RandomForestMSE_BAG_L4 | -54.020574 | 14.471696 | 500.415167 | 0.602694 | 25.096221 | 4 | True | 26 |
| 12 | LightGBMXT_BAG_L4 | -54.025818 | 13.969836 | 487.193233 | 0.100834 | 11.874287 | 4 | True | 24 |
| 13 | LightGBMXT_BAG_L3 | -54.173493 | 12.188583 | 364.027157 | 0.140287 | 11.822690 | 3 | True | 17 |
| 14 | NeuralNetFastAI_BAG_L2 | -55.049654 | 7.542790 | 221.360577 | 0.370137 | 34.878853 | 2 | True | 15 |
| 15 | LightGBM_BAG_L2 | -55.177912 | 7.370190 | 200.115961 | 0.197536 | 13.634238 | 2 | True | 11 |
| 16 | CatBoost_BAG_L2 | -56.042398 | 7.247432 | 238.888609 | 0.074778 | 52.406886 | 2 | True | 13 |
| 17 | CatBoost_BAG_L4 | -58.082147 | 13.907718 | 483.623006 | 0.038717 | 8.304060 | 4 | True | 27 |
| 18 | LightGBMXT_BAG_L2 | -61.181244 | 10.249679 | 217.951745 | 3.077025 | 31.470022 | 2 | True | 10 |
| 19 | KNeighborsDist_BAG_L1 | -84.125061 | 0.103631 | 0.031427 | 0.103631 | 0.031427 | 1 | True | 2 |
| 20 | WeightedEnsemble_L2 | -84.125061 | 0.104777 | 0.660337 | 0.001146 | 0.628910 | 2 | True | 9 |
| 21 | KNeighborsUnif_BAG_L1 | -101.546199 | 0.102882 | 0.032213 | 0.102882 | 0.032213 | 1 | True | 1 |
| 22 | RandomForestMSE_BAG_L1 | -116.544294 | 0.535544 | 10.780408 | 0.535544 | 10.780408 | 1 | True | 5 |
| 23 | ExtraTreesMSE_BAG_L1 | -124.588053 | 0.523520 | 4.838314 | 0.523520 | 4.838314 | 1 | True | 7 |
| 24 | CatBoost_BAG_L1 | -131.429158 | 0.087753 | 77.162732 | 0.087753 | 77.162732 | 1 | True | 6 |
| 25 | LightGBM_BAG_L1 | -131.962589 | 1.254450 | 16.601223 | 1.254450 | 16.601223 | 1 | True | 4 |
| 26 | LightGBMXT_BAG_L1 | -132.671834 | 4.220521 | 31.404651 | 4.220521 | 31.404651 | 1 | True | 3 |
| 27 | NeuralNetFastAI_BAG_L1 | -138.287536 | 0.344353 | 45.630755 | 0.344353 | 45.630755 | 1 | True | 8 |
predictor.evaluate(train)
Evaluation: root_mean_squared_error on test data: -86.38217520565071
Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
"root_mean_squared_error": -86.38217520565071,
"mean_squared_error": -7461.880193259736,
"mean_absolute_error": -55.55438399602884,
"r2": 0.7725750618995305,
"pearsonr": 0.9384259600525297,
"median_absolute_error": -29.758590698242188
}
{'root_mean_squared_error': -86.38217520565071,
'mean_squared_error': -7461.880193259736,
'mean_absolute_error': -55.55438399602884,
'r2': 0.7725750618995305,
'pearsonr': 0.9384259600525297,
'median_absolute_error': -29.758590698242188}
# grab the predictions from the training set
train_preds=predictor.predict(train.iloc[:,:-1])
train_preds[train_preds < 0] = 0
# predictions and target are both log values
train_preds_log = np.log(train_preds + 1)
train_counts_log = np.log(train.iloc[:,-1]+1)
train_errors = (np.sqrt(abs(train_counts_log - train_preds_log))).mean()
print(f"training_RMSLE: {train_errors}")
training_RMSLE: 0.60200915342046
predictions = predictor.predict(test)
predictions.head()
0 27.657717 1 41.931282 2 43.978527 3 48.194435 4 50.486752 Name: count, dtype: float32
# Describe the `predictions` series to see if there are any negative values
predictions.describe()
count 6493.000000 mean 100.548416 std 88.464478 min 4.180945 25% 22.018381 50% 61.643513 75% 171.016632 max 359.307861 Name: count, dtype: float64
# How many negative values do we have?
predictions[predictions < 0].sum()
0.0
submission["count"] = predictions
submission.to_csv("submission.csv", index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "first raw submission"
100%|█████████████████████████████████████████| 188k/188k [00:00<00:00, 414kB/s] Successfully submitted to Bike Sharing Demand
My Submissions¶!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 3
fileName date description status publicScore privateScore -------------- ------------------- --------------------------------------------------- -------- ----------- ------------ submission.csv 2022-10-25 02:52:44 first raw submission complete 1.76328 1.76328 tail: write error: Broken pipe
1.7632¶Let us start by taking a look at the feature names in the dataset
train.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed', 'count'],
dtype='object')
# Create a histogram of all features to show the distribution of each one relative to the data. This is part of the exploritory data analysis
%matplotlib inline
plt.figure(figsize=(16, 9))
train.hist()
array([[<AxesSubplot:title={'center':'season'}>,
<AxesSubplot:title={'center':'holiday'}>,
<AxesSubplot:title={'center':'workingday'}>],
[<AxesSubplot:title={'center':'weather'}>,
<AxesSubplot:title={'center':'temp'}>,
<AxesSubplot:title={'center':'atemp'}>],
[<AxesSubplot:title={'center':'humidity'}>,
<AxesSubplot:title={'center':'windspeed'}>,
<AxesSubplot:title={'center':'count'}>]], dtype=object)
<Figure size 1152x648 with 0 Axes>
Evidently there are some features which are correlated -
holiday and workingday are negatively correlated because a holiday means it is not a workingday and vice versa; these features can be combined into one.weather seems to be correlated with count. This is interesting because the weather column is a ordinal value as of now, This needs to be changed into a categorical feature to make more sense. A weather value of 1 doesn't really mean anything. Also a weather value of 1 isn't really greater than a weather value of 4. They refer to different types of weather conditions.temp and atemp are also correlated features, as when actual temperatures are higher, it will also be perveived like higher temperatures. So we can assume that only one of these features can be included in the model, depends upon which one of them has a higher covariance with the target.humidity is a good one as if the weather is more humid, people are less likely to take bikes. will have to be one-hot encodedwindspeed is also a categorical feature which can be one-hot encodedseason is also a categorical feature which can be one-hot encodeddatetime can be parsed to give out time of the day and further analysis can be done to figure out rush-hour times versus non-rush-hour times.holiday and workingday features¶print(f"holiday value counts:\n {train.holiday.value_counts()}")
print(f"workingday value counts:\n {train.workingday.value_counts()}")
holiday value counts: 0 10575 1 311 Name: holiday, dtype: int64 workingday value counts: 1 7412 0 3474 Name: workingday, dtype: int64
sns.barplot(data=train, x="holiday", y="count")
<AxesSubplot:xlabel='holiday', ylabel='count'>
Now since holiday is a categorical feature, we can one-hot encode it, and remove the original feature since it is not ordinal in nature.
# one hot encode the holiday categorical feature
train = train.join(pd.get_dummies(train.holiday, prefix="holiday_"))
Now since workingday is a categorical feature, we can one-hot encode it, and remove the original feature since it is not ordinal in nature.
sns.barplot(data=train, x="workingday", y="count")
<AxesSubplot:xlabel='workingday', ylabel='count'>
# one hot encode the workingday feature and add it to the train df
train = train.join(pd.get_dummies(train.workingday, prefix="workingday_"))
# we can then remove the original workingday feature
train.drop(columns=["workingday", "holiday"], inplace=True)
train.head()
| datetime | season | weather | temp | atemp | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 1 | 9.84 | 14.395 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 |
| 1 | 2011-01-01 01:00:00 | 1 | 1 | 9.02 | 13.635 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 |
| 2 | 2011-01-01 02:00:00 | 1 | 1 | 9.02 | 13.635 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 |
| 3 | 2011-01-01 03:00:00 | 1 | 1 | 9.84 | 14.395 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 |
| 4 | 2011-01-01 04:00:00 | 1 | 1 | 9.84 | 14.395 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 |
weather feature¶# checking the distribution of values in weather
train.weather.value_counts()
1 7192 2 2834 3 859 4 1 Name: weather, dtype: int64
It is slightly alarming to see that there is only 1 sample for weather = 4
From kaggle it is found that the descriptions for weather are:
It is evident that category 3 and 4 can be clustered together into one - Bad Weather. Category 1 refers to good clear weather, and Category 2 refers to Cloudy weather. Moreover Season type 4 has only 1 sample in the training data, so it does not really add much value to our analysis.
So in total we can have 3 Categories:
train.loc[train.weather == 2, "weather"]
5 2
13 2
14 2
15 2
16 2
..
10835 2
10836 2
10838 2
10839 2
10840 2
Name: weather, Length: 2834, dtype: int64
# one hot encoding the three weather columns and removing the original weather column
train["weather_clear"] = 0
train.loc[train.weather == 1, "weather_clear"] = 1
train["weather_bad"] = 0
train.loc[train.weather == 2, "weather_bad"] = 1
train["weather_cloudy"] = 0
train.loc[(train.weather == 3) | (train.weather == 4), "weather_cloudy"] = 1
train.drop(columns=["weather"], inplace=True)
train.head()
| datetime | season | temp | atemp | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 9.84 | 14.395 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 1 | 2011-01-01 01:00:00 | 1 | 9.02 | 13.635 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 2 | 2011-01-01 02:00:00 | 1 | 9.02 | 13.635 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 3 | 2011-01-01 03:00:00 | 1 | 9.84 | 14.395 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 4 | 2011-01-01 04:00:00 | 1 | 9.84 | 14.395 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
season feature¶From Kaggle it can be seen that the season feature is categorized as follows:
Which means this is also a categorical feature which needs to be encoded, as the values are not ordinal.
First let us check the distribution of season and count values.
It is evident that as the season values change, indicating a change in season from 1 to 4, the count values also change, with an increase during summer and fall, and a decrease during winter and spring. So there is a direct relationship between the season and the bike demand counts
sns.barplot(data = train, x = "season", y = "count")
<AxesSubplot:xlabel='season', ylabel='count'>
Let us one-hot encode these features, add them to our dataset and then remove the original feature
train = train.join(pd.get_dummies(train.season, prefix="season_"))
train.drop(columns=["season"], inplace=True)
train.head()
| datetime | temp | atemp | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 9.84 | 14.395 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2011-01-01 01:00:00 | 9.02 | 13.635 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 2011-01-01 02:00:00 | 9.02 | 13.635 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 2011-01-01 03:00:00 | 9.84 | 14.395 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 2011-01-01 04:00:00 | 9.84 | 14.395 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
atemp and temp columns¶First let us check the distribution of atemp and count values.
sns.scatterplot(data=train, x = "atemp", y = "count")
<AxesSubplot:xlabel='atemp', ylabel='count'>
There seems to be a roughly positive correlation to temperature and counts. However this isn't clear.
It is evident here that the atemp values are discrete in nature even though it is supposed to be continuous. We can thus discretize these values into buckets by creating one-hot vectors.
We can identify the buckets for categorizing the feature using a decision tree model in order to perform feature binning.
The idea is to find the best set of buckets or bins using a decision tree model that will involve correlation with the target variable.
X_features = train[["atemp"]]
y_label = train["count"].values
params = {
'max_depth':[2,4,6,8,10,12],
'min_samples_split':[2,3,5,10]
}
clf_dt = DecisionTreeRegressor()
clf = GridSearchCV(clf_dt, param_grid=params, scoring="neg_root_mean_squared_error")
clf.fit(X_features, y_label)
GridSearchCV(estimator=DecisionTreeRegressor(),
param_grid={'max_depth': [2, 4, 6, 8, 10, 12],
'min_samples_split': [2, 3, 5, 10]},
scoring='neg_root_mean_squared_error')
clf.best_params_
{'max_depth': 6, 'min_samples_split': 2}
clf_dt = DecisionTreeRegressor()
clf_dt.max_depth = 6
clf_dt.min_samples_split = 2
clf_dt.fit(X_features, y_label)
DecisionTreeRegressor(max_depth=6)
%matplotlib inline
plt.figure(figsize=(100, 100))
plot_tree(clf_dt, filled=True, feature_names=list(X_features.columns))
plt.show()
Based on the above, we can see that the bins are structured as follows, since we have 6 nodes in the tree, we can bin the temperatures into 6 categories, which exhibit the highest entropy based splits:
1: values from 0-7.52: values from 7.5-153: values from 15-22.54: values from 22.5-30 5: values from 30-37.56: values from 37.5-46train["atemp_1"] = 0
train.loc[train.atemp <=7.5 , "atemp_1"] = 1
train["atemp_2"] = 0
train.loc[(train.atemp > 7.5) & (train.atemp <= 15), "atemp_2"] = 1
train["atemp_3"] = 0
train.loc[(train.atemp > 15) & (train.atemp <= 22.5), "atemp_3"] = 1
train["atemp_4"] = 0
train.loc[(train.atemp > 22.5) & (train.atemp <= 30), "atemp_4"] = 1
train["atemp_5"] = 0
train.loc[(train.atemp > 30) & (train.atemp <= 37.5), "atemp_5"] = 1
train["atemp_6"] = 0
train.loc[(train.atemp > 37.5) & (train.atemp <= 46), "atemp_6"] = 1
We can now discard the original atemp and temp columns, however let us check the correlation between the two.
print(f"Correlation coefficient between atemp and temp is: {train.atemp.corr(train.temp)}")
sns.regplot(x=train.atemp, y=train.temp)
Correlation coefficient between atemp and temp is: 0.9849481104817066
<AxesSubplot:xlabel='atemp', ylabel='temp'>
As is evident, these 2 are highly correlated. But the feature that we need to keep is the one that has a higher covariance with the target.
sns.set(rc = {'figure.figsize':(16,8)})
sns.heatmap(train.corr(), annot = True, fmt='.2g',cmap= 'coolwarm')
<AxesSubplot:>
Since atemp and temp are both similarly correlated with the target, we can remove them both and have the one-hot encoded features.
train.drop(columns=["atemp", "temp"], inplace=True)
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | season__1 | season__2 | season__3 | season__4 | atemp_1 | atemp_2 | atemp_3 | atemp_4 | atemp_5 | atemp_6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
5 rows × 21 columns
datetime feature¶The datetime feature contains the datetime stamp in the format: YYYY-MM-DD HH:MM:SS
There are a few features than can be generated from this field, which we will corroborate with the following hypotheses:
train.datetime
0 2011-01-01 00:00:00
1 2011-01-01 01:00:00
2 2011-01-01 02:00:00
3 2011-01-01 03:00:00
4 2011-01-01 04:00:00
...
10881 2012-12-19 19:00:00
10882 2012-12-19 20:00:00
10883 2012-12-19 21:00:00
10884 2012-12-19 22:00:00
10885 2012-12-19 23:00:00
Name: datetime, Length: 10886, dtype: object
# train.datetime.values[0].split(" ")[1].split(":")[0]
train.datetime.values[0].split(" ")[0].split("-")[2]
'01'
# define an auxillary function to extract datetime, given a string containing the YYYY-MM-DD HH:MM:SS format
def extract_hour(timestamp):
hour = timestamp.split(" ")[1].split(":")[0]
hour = int(hour) # convert to integer type for easy calculation later on
return hour
def extract_year(timestamp):
year = timestamp.split(" ")[0].split("-")[0]
year = int(year) # convert to integer type for easy calculation later on
return year
def extract_month(timestamp):
month = timestamp.split(" ")[0].split("-")[1]
month = int(month) # convert to integer type for easy calculation later on
return month
def extract_day(timestamp):
day = timestamp.split(" ")[0].split("-")[2]
day = int(day) # convert to integer type for easy calculation later on
return day
# add the hour to the dataframe as a new feature
train["time_of_day"] = train.datetime.apply(extract_hour).values
# add the year to the dataframe as a new feature
train["year"] = train.datetime.apply(extract_year).values
# add the month to the dataframe as a new feature
train["month"] = train.datetime.apply(extract_month).values
# add the day to the dataframe as a new feature
train["day"] = train.datetime.apply(extract_day).values
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | atemp_1 | atemp_2 | atemp_3 | atemp_4 | atemp_5 | atemp_6 | time_of_day | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2011 | 1 | 1 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 2011 | 1 | 1 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 2011 | 1 | 1 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 2011 | 1 | 1 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 4 | 2011 | 1 | 1 |
5 rows × 25 columns
First we start with the year feature we just generated. Let us check for annual trends in bike demand.
sns.barplot(data=train, x="year", y="count")
<AxesSubplot:xlabel='year', ylabel='count'>
It is evident that there was a significant rise in bike demand in 2012. This may be due to external factors which we do not have access to, however since we have datapoints from 2011 in our test data, it makes sense that the year will play a part in predicting bike demand.
We need to convert the year into a categorical feature as well.
Currently we have integer values in year, which needs to be mapped into categorical features. We can one-hot encode these into new features, and drop the original year column.
train = train.join(pd.get_dummies(train.year, prefix="year_"))
train.drop(columns=["year"], inplace=True)
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | atemp_2 | atemp_3 | atemp_4 | atemp_5 | atemp_6 | time_of_day | month | day | year__2011 | year__2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 1 | 0 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 1 | 0 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 4 | 1 | 1 | 1 | 0 |
5 rows × 26 columns
Now let us look at the seasonal/monthly trends of count
sns.barplot(data=train, x="month", y="count")
<AxesSubplot:xlabel='month', ylabel='count'>
As is evident, there is an obvious monthly/seasonal trend in bike demand as well, with more bikes being rented out from June - Sept.
We can model this variation by having categorical features denote the month as well.
train = train.join(pd.get_dummies(train.month, prefix="month_"))
train.drop(columns=["month"], inplace=True)
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | month__3 | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 37 columns
Now let us look at the daily trends of count
sns.barplot(data=train, x="day", y="count")
<AxesSubplot:xlabel='day', ylabel='count'>
This isn't really showing us much, as there aren't any obvious patterns or trends here. A better comparision would be to check whether there are trends between count and a weekday/weekend, and since the 1st of every month (or 2nd, 3rd etc) aren't always a weekday (or weekend) we need to figure out a way to extract that information first.
In order to do that we can use the $datetime$ module
train.datetime
0 2011-01-01 00:00:00
1 2011-01-01 01:00:00
2 2011-01-01 02:00:00
3 2011-01-01 03:00:00
4 2011-01-01 04:00:00
...
10881 2012-12-19 19:00:00
10882 2012-12-19 20:00:00
10883 2012-12-19 21:00:00
10884 2012-12-19 22:00:00
10885 2012-12-19 23:00:00
Name: datetime, Length: 10886, dtype: object
# convert the datetime values from string type to datetime type
train.datetime = train.datetime.apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
# find out the weekdays
# add as a new feature
train["day_of_week"] = train.datetime.apply(lambda x: x.weekday())
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
5 rows × 38 columns
Let us check the trend based on weekday/weekend with count
sns.barplot(data=train, x="day_of_week", y="count")
<AxesSubplot:xlabel='day_of_week', ylabel='count'>
This seems to show more of a consistent trend, with:
This seems to be consistent with the idea that more people want bikes on weekdays rather than weekends.
We can thus bin these into 3 buckets:
# Mon-Wed
train["early_week"] = 0
train.loc[train.day_of_week.between(0,2), "early_week"] = 1
# Thu-Sat
train["pre_weekend"] = 0
train.loc[train.day_of_week.between(3,5), "pre_weekend"] = 1
# Sunday
train["sunday"] = 0
train.loc[train.day_of_week == 6, "sunday"] = 1
train.drop(columns=["day_of_week", "day"], inplace=True)
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | early_week | pre_weekend | sunday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 39 columns
train.columns
Index(['datetime', 'humidity', 'windspeed', 'count', 'holiday__0',
'holiday__1', 'workingday__0', 'workingday__1', 'weather_clear',
'weather_bad', 'weather_cloudy', 'season__1', 'season__2', 'season__3',
'season__4', 'atemp_1', 'atemp_2', 'atemp_3', 'atemp_4', 'atemp_5',
'atemp_6', 'time_of_day', 'year__2011', 'year__2012', 'month__1',
'month__2', 'month__3', 'month__4', 'month__5', 'month__6', 'month__7',
'month__8', 'month__9', 'month__10', 'month__11', 'month__12',
'early_week', 'pre_weekend', 'sunday'],
dtype='object')
Now based on the time_of_day feature we can engineer other new features, which will be categorical so we will have to one-hot encode them as well.
We start by figuring out the logic for the bins.
We find out the bins for the time of day similar to how we did for the atemp feature. We use a decision tree regressor to figure out the nodes of best split.
X_features = train[["time_of_day"]]
y_label = train["count"].values
params = {
'max_depth':[2,4,6,8,10,12],
'min_samples_split':[2,3,5,10]
}
clf_dt = DecisionTreeRegressor()
clf = GridSearchCV(clf_dt, param_grid=params, scoring="neg_root_mean_squared_error")
clf.fit(X_features, y_label)
clf.best_params_
{'max_depth': 8, 'min_samples_split': 2}
clf_dt = DecisionTreeRegressor()
clf_dt.max_depth = 5
clf_dt.min_samples_split = 2
clf_dt.fit(X_features, y_label)
DecisionTreeRegressor(max_depth=5)
%matplotlib inline
plt.figure(figsize=(100, 100))
plot_tree(clf_dt, filled=True, feature_names=list(X_features.columns))
plt.show()
Let us corroborate this node split with the trends seen hourly as well
sns.barplot(data = train, x="time_of_day", y="count")
<AxesSubplot:xlabel='time_of_day', ylabel='count'>
According to both the plots we can safely assume that the bins for the hourly trends look like:
As is evident the rush hour times see the most spike in demand. So we can use this as a feature.
# let us create the new columns
train["time_0"] = 0
train.loc[(train.time_of_day > 0) & (train.time_of_day <= 4.5), "time_0"] = 1
train["time_1"] = 0
train.loc[(train.time_of_day > 4.5) & (train.time_of_day <= 6.5), "time_1"] = 1
train["time_2"] = 0
train.loc[(train.time_of_day > 6.5) & (train.time_of_day <= 8.5), "time_2"] = 1
train["time_3"] = 0
train.loc[(train.time_of_day > 8.5) & (train.time_of_day <= 15.5), "time_3"] = 1
train["time_4"] = 0
train.loc[(train.time_of_day > 15.5) & (train.time_of_day <= 18.5), "time_4"] = 1
train["time_5"] = 0
train.loc[(train.time_of_day > 18.5) & (train.time_of_day <= 20.5), "time_5"] = 1
train["time_6"] = 0
train.loc[(train.time_of_day > 20.5) & (train.time_of_day <= 22.5), "time_6"] = 1
train["time_7"] = 0
train.loc[(train.time_of_day > 22.5) & (train.time_of_day <= 24), "time_7"] = 1
# and we can remove the original time_of_day feature
train.drop(columns=["time_of_day"], inplace = True)
train.head()
| datetime | humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | ... | pre_weekend | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2011-01-01 01:00:00 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2011-01-01 02:00:00 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 2011-01-01 03:00:00 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 2011-01-01 04:00:00 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 46 columns
humidity and windspeed features.¶From what it looks like, it might be better to leave humidity and windspeed as continuous variables rather than convert them into categorical. However since they are on a different scale from the other one-hot encoded features, it is imperative that they be standard-normalized so that they don't get extra weightage while the models train.
# before we can perform standardization we need to remove the datetime column as well since it is a string
train.drop(columns=["datetime"], inplace=True)
train.head()
| humidity | windspeed | count | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | ... | pre_weekend | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81 | 0.0 | 16 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 80 | 0.0 | 40 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 80 | 0.0 | 32 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 75 | 0.0 | 13 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 75 | 0.0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 45 columns
Let us look at the distributions of humidity and windpeed
sns.histplot(data=train, x="humidity", kde=True)
<AxesSubplot:xlabel='humidity', ylabel='Count'>
sns.histplot(data=train, x="windspeed", kde=True)
<AxesSubplot:xlabel='windspeed', ylabel='Count'>
Looks like humidity is almost normally distributed, so we can just standardize those values using min-max scaling.
However windspeed seems quite skewed. We can normalize windspeed, but there are quite a few 0s which seem like they are outliers. We can treat them as missing values/error values, and based on other features which affect windspeed, estimate the value of windspeed for those values.
We can then check the distribution to see if that has improved the distribution to be more normal.
We can make a preprocessing pipeline for the other features so that we can do a one-step preprocess of the train and test data, which can then be used to model both the count predictor as well as for the windspeed predictor
for col in train.columns:
if col not in ['humidity', 'windspeed', 'count']:
train[col] = train[col].astype("category")
# View histogram of all features again now with the hour feature
train.hist()
array([[<AxesSubplot:title={'center':'humidity'}>,
<AxesSubplot:title={'center':'windspeed'}>],
[<AxesSubplot:title={'center':'count'}>, <AxesSubplot:>]],
dtype=object)
count label parametercount label parameter# define an auxillary function to extract datetime, given a string containing the YYYY-MM-DD HH:MM:SS format
def extract_hour(timestamp):
hour = timestamp.split(" ")[1].split(":")[0]
hour = int(hour) # convert to integer type for easy calculation later on
return hour
def extract_year(timestamp):
year = timestamp.split(" ")[0].split("-")[0]
year = int(year) # convert to integer type for easy calculation later on
return year
def extract_month(timestamp):
month = timestamp.split(" ")[0].split("-")[1]
month = int(month) # convert to integer type for easy calculation later on
return month
def extract_day(timestamp):
day = timestamp.split(" ")[0].split("-")[2]
day = int(day) # convert to integer type for easy calculation later on
return day
def preprocess_data(df):
# one hot encode the holiday feature and add it to the train df
df = df.join(pd.get_dummies(df.holiday, prefix="holiday_"))
df.drop(columns=["holiday"], inplace=True)
# one hot encode the workingday feature and add it to the train df
df = df.join(pd.get_dummies(df.workingday, prefix="workingday_"))
# we can then remove the original workingday feature
df.drop(columns=["workingday"], inplace=True)
# one hot encoding the three weather columns and removing the original weather column
df["weather_clear"] = 0
df.loc[df.weather == 1, "weather_clear"] = 1
df["weather_bad"] = 0
df.loc[df.weather == 2, "weather_bad"] = 1
df["weather_cloudy"] = 0
df.loc[(df.weather == 3) | (df.weather == 4), "weather_cloudy"] = 1
df.drop(columns=["weather"], inplace=True)
# one hot encoding the season column
df = df.join(pd.get_dummies(df.season, prefix="season_"))
df.drop(columns=["season"], inplace=True)
# one hot encoding the temp bins
df["atemp_1"] = 0
df.loc[df.atemp <=7.5 , "atemp_1"] = 1
df["atemp_2"] = 0
df.loc[(df.atemp > 7.5) & (df.atemp <= 15), "atemp_2"] = 1
df["atemp_3"] = 0
df.loc[(df.atemp > 15) & (df.atemp <= 22.5), "atemp_3"] = 1
df["atemp_4"] = 0
df.loc[(df.atemp > 22.5) & (df.atemp <= 30), "atemp_4"] = 1
df["atemp_5"] = 0
df.loc[(df.atemp > 30) & (df.atemp <= 37.5), "atemp_5"] = 1
df["atemp_6"] = 0
df.loc[(df.atemp > 37.5) & (df.atemp <= 46), "atemp_6"] = 1
df.drop(columns=["atemp", "temp"], inplace=True)
# add the hour to the dataframe as a new feature
df["time_of_day"] = df.datetime.apply(extract_hour).values
# add the year to the dataframe as a new feature
df["year"] = df.datetime.apply(extract_year).values
# add the month to the dataframe as a new feature
df["month"] = df.datetime.apply(extract_month).values
# add the day to the dataframe as a new feature
df["day"] = df.datetime.apply(extract_day).values
# one hot encode the year feature
df = df.join(pd.get_dummies(df.year, prefix="year_"))
df.drop(columns=["year"], inplace=True)
# one hot encode the month feature
df = df.join(pd.get_dummies(df.month, prefix="month_"))
df.drop(columns=["month"], inplace=True)
# convert the datetime values from string type to datetime type
df.datetime = df.datetime.apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
# find out the weekdays
# add as a new feature
df["day_of_week"] = df.datetime.apply(lambda x: x.weekday())
# Mon-Wed
df["early_week"] = 0
df.loc[df.day_of_week.between(0,2), "early_week"] = 1
# Thu-Sat
df["pre_weekend"] = 0
df.loc[df.day_of_week.between(3,5), "pre_weekend"] = 1
# Sunday
df["sunday"] = 0
df.loc[df.day_of_week == 6, "sunday"] = 1
df.drop(columns=["day_of_week", "day"], inplace=True)
# let us create the new columns
# ont hot encode the time of day feature
df["time_0"] = 0
df.loc[(df.time_of_day > 0) & (df.time_of_day <= 4.5), "time_0"] = 1
df["time_1"] = 0
df.loc[(df.time_of_day > 4.5) & (df.time_of_day <= 6.5), "time_1"] = 1
df["time_2"] = 0
df.loc[(df.time_of_day > 6.5) & (df.time_of_day <= 8.5), "time_2"] = 1
df["time_3"] = 0
df.loc[(df.time_of_day > 8.5) & (df.time_of_day <= 15.5), "time_3"] = 1
df["time_4"] = 0
df.loc[(df.time_of_day > 15.5) & (df.time_of_day <= 18.5), "time_4"] = 1
df["time_5"] = 0
df.loc[(df.time_of_day > 18.5) & (df.time_of_day <= 20.5), "time_5"] = 1
df["time_6"] = 0
df.loc[(df.time_of_day > 20.5) & (df.time_of_day <= 22.5), "time_6"] = 1
df["time_7"] = 0
df.loc[(df.time_of_day > 22.5) & (df.time_of_day <= 24), "time_7"] = 1
# and we can remove the original time_of_day feature
df.drop(columns=["time_of_day"], inplace = True)
# before we can perform standardization we need to remove the datetime column as well since it is a string
df.drop(columns=["datetime"], inplace=True)
return df
# read in the required raw datasets
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")
# dropping columns from training dataset which are not present in test dataset
train_df.drop(columns = ["casual", "registered"], inplace=True)
train_df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | count | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 1 |
test_df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 |
| 1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
| 2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
| 3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
| 4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
# train_ws_max = train_df.windspeed.max()
train_hum_max = train_df.humidity.max()
# train_ws_min = train_df.windspeed.min()
train_hum_min = train_df.humidity.min()
# we cannot perform scaling differently on the test set, it has to be min-max scaled based on the train samples
# standardize the humidity column using min-max scaling
normalized_df=(train_df[["humidity"]]-train_hum_min)/(train_hum_max-train_hum_min)
train_df.drop(columns=["humidity"], inplace=True)
train_df = train_df.join(normalized_df)
train_df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | windspeed | count | humidity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 0.0 | 16 | 0.81 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 0.0 | 40 | 0.80 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 0.0 | 32 | 0.80 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 0.0 | 13 | 0.75 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 0.0 | 1 | 0.75 |
# now use the min and max values from the train df to scale the test variables
normalized_df=(test_df[["humidity"]]-train_hum_min)/(train_hum_max-train_hum_min)
test_df.drop(columns=["humidity"], inplace=True)
test_df = test_df.join(normalized_df)
test_df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | windspeed | humidity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 26.0027 | 0.56 |
| 1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 0.0000 | 0.56 |
| 2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 0.0000 | 0.56 |
| 3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 11.0014 | 0.56 |
| 4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 11.0014 | 0.56 |
preprocessed_train_data = preprocess_data(train_df)
preprocessed_test_data = preprocess_data(test_df)
preprocessed_train_data.head()
| windspeed | count | humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | ... | pre_weekend | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 16 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.0 | 40 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0.0 | 32 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0.0 | 13 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0.0 | 1 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 45 columns
# cast to categorical datatype
for col in preprocessed_train_data.columns:
if col not in ['humidity', 'windspeed', 'count']:
preprocessed_train_data[col] = preprocessed_train_data[col].astype("category")
preprocessed_train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 windspeed 10886 non-null float64 1 count 10886 non-null int64 2 humidity 10886 non-null float64 3 holiday__0 10886 non-null category 4 holiday__1 10886 non-null category 5 workingday__0 10886 non-null category 6 workingday__1 10886 non-null category 7 weather_clear 10886 non-null category 8 weather_bad 10886 non-null category 9 weather_cloudy 10886 non-null category 10 season__1 10886 non-null category 11 season__2 10886 non-null category 12 season__3 10886 non-null category 13 season__4 10886 non-null category 14 atemp_1 10886 non-null category 15 atemp_2 10886 non-null category 16 atemp_3 10886 non-null category 17 atemp_4 10886 non-null category 18 atemp_5 10886 non-null category 19 atemp_6 10886 non-null category 20 year__2011 10886 non-null category 21 year__2012 10886 non-null category 22 month__1 10886 non-null category 23 month__2 10886 non-null category 24 month__3 10886 non-null category 25 month__4 10886 non-null category 26 month__5 10886 non-null category 27 month__6 10886 non-null category 28 month__7 10886 non-null category 29 month__8 10886 non-null category 30 month__9 10886 non-null category 31 month__10 10886 non-null category 32 month__11 10886 non-null category 33 month__12 10886 non-null category 34 early_week 10886 non-null category 35 pre_weekend 10886 non-null category 36 sunday 10886 non-null category 37 time_0 10886 non-null category 38 time_1 10886 non-null category 39 time_2 10886 non-null category 40 time_3 10886 non-null category 41 time_4 10886 non-null category 42 time_5 10886 non-null category 43 time_6 10886 non-null category 44 time_7 10886 non-null category dtypes: category(42), float64(2), int64(1) memory usage: 706.8 KB
Same for test data
preprocessed_test_data.head()
| windspeed | humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | ... | pre_weekend | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26.0027 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.0000 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0.0000 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 11.0014 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 11.0014 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 44 columns
# cast to categorical datatype
for col in preprocessed_test_data.columns:
if col not in ['humidity', 'windspeed']:
preprocessed_test_data[col] = preprocessed_test_data[col].astype("category")
preprocessed_test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6493 entries, 0 to 6492 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 windspeed 6493 non-null float64 1 humidity 6493 non-null float64 2 holiday__0 6493 non-null category 3 holiday__1 6493 non-null category 4 workingday__0 6493 non-null category 5 workingday__1 6493 non-null category 6 weather_clear 6493 non-null category 7 weather_bad 6493 non-null category 8 weather_cloudy 6493 non-null category 9 season__1 6493 non-null category 10 season__2 6493 non-null category 11 season__3 6493 non-null category 12 season__4 6493 non-null category 13 atemp_1 6493 non-null category 14 atemp_2 6493 non-null category 15 atemp_3 6493 non-null category 16 atemp_4 6493 non-null category 17 atemp_5 6493 non-null category 18 atemp_6 6493 non-null category 19 year__2011 6493 non-null category 20 year__2012 6493 non-null category 21 month__1 6493 non-null category 22 month__2 6493 non-null category 23 month__3 6493 non-null category 24 month__4 6493 non-null category 25 month__5 6493 non-null category 26 month__6 6493 non-null category 27 month__7 6493 non-null category 28 month__8 6493 non-null category 29 month__9 6493 non-null category 30 month__10 6493 non-null category 31 month__11 6493 non-null category 32 month__12 6493 non-null category 33 early_week 6493 non-null category 34 pre_weekend 6493 non-null category 35 sunday 6493 non-null category 36 time_0 6493 non-null category 37 time_1 6493 non-null category 38 time_2 6493 non-null category 39 time_3 6493 non-null category 40 time_4 6493 non-null category 41 time_5 6493 non-null category 42 time_6 6493 non-null category 43 time_7 6493 non-null category dtypes: category(42), float64(2) memory usage: 373.0 KB
windspeed feature¶We first combine the train and test data for this
combined_df = preprocessed_train_data.append(preprocessed_test_data, ignore_index=True)
combined_df.head()
combined_df.drop(columns=['count'], inplace=True)
combined_df.describe()
| windspeed | humidity | |
|---|---|---|
| count | 17379.000000 | 17379.000000 |
| mean | 12.736540 | 0.627229 |
| std | 8.196795 | 0.192930 |
| min | 0.000000 | 0.000000 |
| 25% | 7.001500 | 0.480000 |
| 50% | 12.998000 | 0.630000 |
| 75% | 16.997900 | 0.780000 |
| max | 56.996900 | 1.000000 |
combined_df.columns
Index(['windspeed', 'humidity', 'holiday__0', 'holiday__1', 'workingday__0',
'workingday__1', 'weather_clear', 'weather_bad', 'weather_cloudy',
'season__1', 'season__2', 'season__3', 'season__4', 'atemp_1',
'atemp_2', 'atemp_3', 'atemp_4', 'atemp_5', 'atemp_6', 'year__2011',
'year__2012', 'month__1', 'month__2', 'month__3', 'month__4',
'month__5', 'month__6', 'month__7', 'month__8', 'month__9', 'month__10',
'month__11', 'month__12', 'early_week', 'pre_weekend', 'sunday',
'time_0', 'time_1', 'time_2', 'time_3', 'time_4', 'time_5', 'time_6',
'time_7'],
dtype='object')
Necessary features for estimating windspeed are:
from scipy.stats import boxcox
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
feature_names = [x for x in list(combined_df.columns) if x not in ["holiday__0", "holiday__1",
"workingday__0", "workingday__1",
"year__2011", "year__2012",
"pre_weekend", "sunday", "early_week",
"time_0",'time_1','time_2','time_3','time_4','time_5','time_6','time_7'
]]
combined_df = combined_df.loc[:, feature_names]
combined_df
| windspeed | humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | ... | month__3 | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0000 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.0000 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0.0000 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0.0000 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0.0000 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17374 | 11.0014 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17375 | 11.0014 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17376 | 11.0014 | 0.60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17377 | 8.9981 | 0.56 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17378 | 8.9981 | 0.65 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
17379 rows × 27 columns
windspeed_test_set = combined_df.loc[combined_df.windspeed == 0, ]
windspeed_train_set = combined_df.loc[combined_df.windspeed != 0, ]
As we have seen above, the windspeed feature is very skewed. We need to transform it so it approximates a gaussian distribution. This will allow us to estimate it better.
For this purpose we use the box-cox transformation from the scipy module
transformed_windspeed = boxcox(windspeed_train_set["windspeed"].values, lmbda = -0.035)
pd.options.mode.chained_assignment = None
windspeed_train_set["windspeed_transformed"] = transformed_windspeed
windspeed_train_set
| windspeed | humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | ... | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 6.0032 | 0.75 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.737234 |
| 10 | 16.9979 | 0.76 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.697158 |
| 11 | 19.0012 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.797856 |
| 12 | 19.0012 | 0.77 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.797856 |
| 13 | 19.9995 | 0.72 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.844006 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17374 | 11.0014 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17375 | 11.0014 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17376 | 11.0014 | 0.60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17377 | 8.9981 | 0.56 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
| 17378 | 8.9981 | 0.65 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
15199 rows × 28 columns
sns.histplot(data=windspeed_train_set, x = "windspeed_transformed", kde=True)
<AxesSubplot:xlabel='windspeed_transformed', ylabel='Count'>
windspeed_train_set.drop(columns=["windspeed"],inplace=True)
windspeed_train_set
| humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | atemp_2 | ... | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 0.75 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.737234 |
| 10 | 0.76 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.697158 |
| 11 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.797856 |
| 12 | 0.77 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.797856 |
| 13 | 0.72 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.844006 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17374 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17375 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17376 | 0.60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17377 | 0.56 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
| 17378 | 0.65 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
15199 rows × 27 columns
Using XGBoost to model the windspeed
X_train, X_val = train_test_split(windspeed_train_set, test_size=0.2, shuffle=True, random_state=43)
dtrain = xgb.DMatrix(
X_train.loc[:, X_train.columns != "windspeed_transformed"], label=X_train["windspeed_transformed"], enable_categorical = True
)
dval = xgb.DMatrix(
X_val.loc[:, X_val.columns != "windspeed_transformed"], label=X_val["windspeed_transformed"], enable_categorical=True
)
param = {
"max_depth": 5,
"eta": 0.03,
"gamma": 0.09,
"colsample_bytree": 0.5,
"objective": "reg:squarederror",
}
num_round = 500
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dval)
sklearn.metrics.mean_absolute_error(X_val["windspeed_transformed"], preds)
0.32160089500676836
using this trained model to estimate the box-cox transformed windspeeds of the test set (the 0 values)
windspeed_test_set.drop(columns=["windspeed"],inplace=True)
windspeed_test_set
| humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | atemp_2 | ... | month__3 | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17319 | 1.00 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17320 | 1.00 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17322 | 0.87 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17330 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17350 | 0.61 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2180 rows × 26 columns
dtest = xgb.DMatrix(
windspeed_test_set.loc[:, windspeed_test_set.columns != "windspeed_transformed"], enable_categorical=True
)
preds_test = bst.predict(dtest)
windspeed_test_set["windspeed_transformed"] = preds_test
windspeed_test_set
| humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | atemp_2 | ... | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 1 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 2 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 3 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
| 4 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17319 | 1.00 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.633469 |
| 17320 | 1.00 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.633469 |
| 17322 | 0.87 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.410033 |
| 17330 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.465903 |
| 17350 | 0.61 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.355105 |
2180 rows × 27 columns
new_df = windspeed_train_set.append(windspeed_test_set)
new_df.sort_index(inplace=True)
new_df
| humidity | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | season__3 | season__4 | atemp_1 | atemp_2 | ... | month__4 | month__5 | month__6 | month__7 | month__8 | month__9 | month__10 | month__11 | month__12 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 1 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 2 | 0.80 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 3 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
| 4 | 0.75 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17374 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17375 | 0.60 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17376 | 0.60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.300146 |
| 17377 | 0.56 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
| 17378 | 0.65 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2.114667 |
17379 rows × 27 columns
pre_processed_train_data = new_df.iloc[:10886,]
windspeed_transformed_train = pre_processed_train_data.windspeed_transformed.values
pre_processed_test_data = new_df.iloc[10886:,]
windspeed_transformed_test = pre_processed_test_data.windspeed_transformed.values
# add these values to the pre-processed dataset before combining
preprocessed_train_data.drop(columns=["windspeed"], inplace=True)
preprocessed_train_data["windspeed_transformed"] = windspeed_transformed_train
preprocessed_train_data.head()
| count | humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | ... | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 1 | 40 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 2 | 32 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.183253 |
| 3 | 13 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
| 4 | 1 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.209244 |
5 rows × 45 columns
preprocessed_test_data.drop(columns=["windspeed"], inplace=True)
preprocessed_test_data["windspeed_transformed"] = windspeed_transformed_test
preprocessed_test_data.head()
| humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | ... | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3.079288 |
| 1 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.367040 |
| 2 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.367040 |
| 3 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.300146 |
| 4 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.300146 |
5 rows × 44 columns
Now we can normalize the windspeed values using min-max transformation
train_ws_max = preprocessed_train_data.windspeed_transformed.max()
train_ws_min = preprocessed_train_data.windspeed_transformed.min()
normalized_df=(preprocessed_train_data[["windspeed_transformed"]]-train_ws_min)/(train_ws_max-train_ws_min)
preprocessed_train_data.drop(columns=["windspeed_transformed"],inplace=True)
preprocessed_train_data = preprocessed_train_data.join(normalized_df)
preprocessed_train_data.head()
| count | humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | ... | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 |
| 1 | 40 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 |
| 2 | 32 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 |
| 3 | 13 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 |
| 4 | 1 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 |
5 rows × 45 columns
normalized_df=(preprocessed_test_data[["windspeed_transformed"]]-train_ws_min)/(train_ws_max-train_ws_min)
preprocessed_test_data.drop(columns=["windspeed_transformed"],inplace=True)
preprocessed_test_data = preprocessed_test_data.join(normalized_df)
preprocessed_test_data.head()
| humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | ... | sunday | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.660219 |
| 1 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.309831 |
| 2 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.309831 |
| 3 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.276923 |
| 4 | 0.56 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.276923 |
5 rows × 44 columns
Last step remaining is to transform the count variable since it is also a skewed variable.
However we have to keep in mind that the actual predictions to be submitted need to be transformed back.
sns.histplot(preprocessed_train_data, x="count", kde=True)
<AxesSubplot:xlabel='count', ylabel='Count'>
preprocessed_train_data["count_transformed"] = boxcox(preprocessed_train_data['count'].values, lmbda=0.32)
sns.histplot(preprocessed_train_data, x="count_transformed", kde=True)
<AxesSubplot:xlabel='count_transformed', ylabel='Count'>
Count seems to be normally distributed now with these values, so let us run the model to predict these transformed counts, and then do an inverse boxcox transform on the predicted values to get the actual counts back
preprocessed_train_data.drop(columns=["count"], inplace=True)
preprocessed_train_data.head()
| humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | ... | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | count_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 4.463718 |
| 1 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 7.049410 |
| 2 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 6.348229 |
| 3 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 3.975875 |
| 4 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 0.000000 |
5 rows × 45 columns
Run the model again on the new datasets
predictor = TabularPredictor(label="count_transformed", eval_metric="root_mean_squared_error").fit(preprocessed_train_data,
time_limit=1200,
presets="best_quality",
auto_stack=True,
num_bag_folds=5,
num_bag_sets=2,
num_stack_levels=3
)
No path specified. Models will be saved in: "AutogluonModels/ag-20221025_030847/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=2
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "AutogluonModels/ag-20221025_030847/"
AutoGluon Version: 0.5.2
Python Version: 3.7.10
Operating System: Linux
Train Data Rows: 10886
Train Data Columns: 44
Label Column: count_transformed
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
Label info (max, min, mean, stddev): (25.16391443532209, 0.0, 11.65516, 5.73301)
If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 2264.0 MB
Train Data (Original) Memory Usage: 0.63 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Note: Converting 42 features to boolean dtype as they only contain 2 unique values.
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('category', []) : 42 | ['holiday__0', 'holiday__1', 'workingday__0', 'workingday__1', 'weather_clear', ...]
('float', []) : 2 | ['humidity', 'windspeed_transformed']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 2 | ['humidity', 'windspeed_transformed']
('int', ['bool']) : 42 | ['holiday__0', 'holiday__1', 'workingday__0', 'workingday__1', 'weather_clear', ...]
0.2s = Fit runtime
44 features in original data used to generate 44 features in processed data.
Train Data (Processed) Memory Usage: 0.63 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.23s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
To change this, specify the eval_metric parameter of Predictor()
AutoGluon will fit 4 stack levels (L1 to L4) ...
Fitting 11 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ... Training model for up to 399.82s of the 1199.76s of remaining time.
-5.7884 = Validation score (-root_mean_squared_error)
0.02s = Training runtime
0.1s = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ... Training model for up to 399.45s of the 1199.39s of remaining time.
-5.8396 = Validation score (-root_mean_squared_error)
0.01s = Training runtime
0.11s = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ... Training model for up to 399.08s of the 1199.02s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7926 = Validation score (-root_mean_squared_error)
13.17s = Training runtime
0.26s = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 382.11s of the 1182.05s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.8069 = Validation score (-root_mean_squared_error)
11.85s = Training runtime
0.19s = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 367.23s of the 1167.17s of remaining time.
-1.9719 = Validation score (-root_mean_squared_error)
14.07s = Training runtime
0.68s = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 349.94s of the 1149.88s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7961 = Validation score (-root_mean_squared_error)
29.63s = Training runtime
0.07s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ... Training model for up to 317.41s of the 1117.35s of remaining time.
-1.9815 = Validation score (-root_mean_squared_error)
9.18s = Training runtime
0.62s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ... Training model for up to 305.13s of the 1105.07s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7936 = Validation score (-root_mean_squared_error)
71.99s = Training runtime
0.41s = Validation runtime
Fitting model: XGBoost_BAG_L1 ... Training model for up to 229.85s of the 1029.79s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.8527 = Validation score (-root_mean_squared_error)
11.98s = Training runtime
0.14s = Validation runtime
Fitting model: NeuralNetTorch_BAG_L1 ... Training model for up to 214.76s of the 1014.7s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.832 = Validation score (-root_mean_squared_error)
100.33s = Training runtime
0.18s = Validation runtime
Fitting model: LightGBMLarge_BAG_L1 ... Training model for up to 111.53s of the 911.47s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.8319 = Validation score (-root_mean_squared_error)
15.51s = Training runtime
0.21s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 892.69s of remaining time.
-1.7667 = Validation score (-root_mean_squared_error)
0.63s = Training runtime
0.0s = Validation runtime
Fitting 9 L2 models ...
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 396.33s of the 891.95s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7462 = Validation score (-root_mean_squared_error)
15.99s = Training runtime
0.32s = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 377.27s of the 872.89s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7537 = Validation score (-root_mean_squared_error)
13.45s = Training runtime
0.1s = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 360.95s of the 856.57s of remaining time.
-1.735 = Validation score (-root_mean_squared_error)
39.36s = Training runtime
0.82s = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 318.23s of the 813.85s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7448 = Validation score (-root_mean_squared_error)
36.42s = Training runtime
0.06s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L2 ... Training model for up to 278.9s of the 774.52s of remaining time.
-1.716 = Validation score (-root_mean_squared_error)
15.51s = Training runtime
0.7s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 260.04s of the 755.66s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7874 = Validation score (-root_mean_squared_error)
69.98s = Training runtime
0.44s = Validation runtime
Fitting model: XGBoost_BAG_L2 ... Training model for up to 187.12s of the 682.74s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.759 = Validation score (-root_mean_squared_error)
16.2s = Training runtime
0.1s = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 167.8s of the 663.42s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.8164 = Validation score (-root_mean_squared_error)
67.66s = Training runtime
0.18s = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 97.15s of the 592.77s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7549 = Validation score (-root_mean_squared_error)
23.66s = Training runtime
0.23s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.0s of the 565.96s of remaining time.
-1.71 = Validation score (-root_mean_squared_error)
0.54s = Training runtime
0.0s = Validation runtime
Fitting 9 L3 models ...
Fitting model: LightGBMXT_BAG_L3 ... Training model for up to 376.8s of the 565.32s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7416 = Validation score (-root_mean_squared_error)
12.17s = Training runtime
0.09s = Validation runtime
Fitting model: LightGBM_BAG_L3 ... Training model for up to 361.58s of the 550.1s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7309 = Validation score (-root_mean_squared_error)
12.45s = Training runtime
0.09s = Validation runtime
Fitting model: RandomForestMSE_BAG_L3 ... Training model for up to 346.12s of the 534.64s of remaining time.
-1.7489 = Validation score (-root_mean_squared_error)
36.35s = Training runtime
0.74s = Validation runtime
Fitting model: CatBoost_BAG_L3 ... Training model for up to 306.53s of the 495.05s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7267 = Validation score (-root_mean_squared_error)
17.89s = Training runtime
0.06s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L3 ... Training model for up to 285.66s of the 474.18s of remaining time.
-1.7405 = Validation score (-root_mean_squared_error)
14.95s = Training runtime
0.65s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L3 ... Training model for up to 267.47s of the 455.99s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7887 = Validation score (-root_mean_squared_error)
64.19s = Training runtime
0.61s = Validation runtime
Fitting model: XGBoost_BAG_L3 ... Training model for up to 200.42s of the 388.94s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7409 = Validation score (-root_mean_squared_error)
13.34s = Training runtime
0.09s = Validation runtime
Fitting model: NeuralNetTorch_BAG_L3 ... Training model for up to 184.02s of the 372.54s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.755 = Validation score (-root_mean_squared_error)
66.3s = Training runtime
0.18s = Validation runtime
Fitting model: LightGBMLarge_BAG_L3 ... Training model for up to 114.77s of the 303.3s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7529 = Validation score (-root_mean_squared_error)
20.98s = Training runtime
0.17s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L4 ... Training model for up to 360.0s of the 279.2s of remaining time.
-1.7191 = Validation score (-root_mean_squared_error)
0.55s = Training runtime
0.0s = Validation runtime
Fitting 9 L4 models ...
Fitting model: LightGBMXT_BAG_L4 ... Training model for up to 278.57s of the 278.55s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7504 = Validation score (-root_mean_squared_error)
12.15s = Training runtime
0.1s = Validation runtime
Fitting model: LightGBM_BAG_L4 ... Training model for up to 263.38s of the 263.36s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7427 = Validation score (-root_mean_squared_error)
12.65s = Training runtime
0.09s = Validation runtime
Fitting model: RandomForestMSE_BAG_L4 ... Training model for up to 247.68s of the 247.64s of remaining time.
-1.7599 = Validation score (-root_mean_squared_error)
36.55s = Training runtime
0.73s = Validation runtime
Fitting model: CatBoost_BAG_L4 ... Training model for up to 207.49s of the 207.46s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7329 = Validation score (-root_mean_squared_error)
16.83s = Training runtime
0.06s = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L4 ... Training model for up to 187.59s of the 187.56s of remaining time.
-1.7598 = Validation score (-root_mean_squared_error)
14.89s = Training runtime
0.74s = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L4 ... Training model for up to 169.39s of the 169.36s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7967 = Validation score (-root_mean_squared_error)
63.45s = Training runtime
0.42s = Validation runtime
Fitting model: XGBoost_BAG_L4 ... Training model for up to 102.9s of the 102.88s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7534 = Validation score (-root_mean_squared_error)
12.91s = Training runtime
0.11s = Validation runtime
Fitting model: NeuralNetTorch_BAG_L4 ... Training model for up to 86.89s of the 86.87s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7605 = Validation score (-root_mean_squared_error)
62.97s = Training runtime
0.19s = Validation runtime
Fitting model: LightGBMLarge_BAG_L4 ... Training model for up to 21.02s of the 21.0s of remaining time.
Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
-1.7696 = Validation score (-root_mean_squared_error)
19.92s = Training runtime
0.16s = Validation runtime
Completed 1/2 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L5 ... Training model for up to 360.0s of the -2.04s of remaining time.
-1.7293 = Validation score (-root_mean_squared_error)
0.61s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 1202.9s ... Best model: "WeightedEnsemble_L3"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20221025_030847/")
predictor.fit_summary()
*** Summary of fit() ***
Estimated performance of each model:
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L3 -1.709967 5.543761 479.201571 0.000749 0.535699 3 True 22
1 ExtraTreesMSE_BAG_L2 -1.716013 3.671685 293.250168 0.702600 15.511165 2 True 17
2 WeightedEnsemble_L4 -1.719143 7.558237 671.496551 0.000841 0.545253 4 True 32
3 CatBoost_BAG_L3 -1.726740 5.987127 593.862323 0.064462 17.888313 3 True 26
4 WeightedEnsemble_L5 -1.729316 10.520802 991.985767 0.001301 0.611000 5 True 42
5 LightGBM_BAG_L3 -1.730860 6.009135 588.421698 0.086470 12.447689 3 True 24
6 CatBoost_BAG_L4 -1.732903 8.663546 851.421007 0.055093 16.833139 4 True 36
7 RandomForestMSE_BAG_L2 -1.734991 3.786224 317.101645 0.817139 39.362643 2 True 15
8 ExtraTreesMSE_BAG_L3 -1.740521 6.575164 590.926323 0.652499 14.952313 3 True 27
9 XGBoost_BAG_L3 -1.740927 6.012204 589.313933 0.089540 13.339923 3 True 29
10 LightGBMXT_BAG_L3 -1.741577 6.013078 588.139951 0.090413 12.165942 3 True 23
11 LightGBM_BAG_L4 -1.742679 8.694140 847.234471 0.085687 12.646603 4 True 34
12 CatBoost_BAG_L2 -1.744837 3.028521 314.161335 0.059436 36.422333 2 True 16
13 LightGBMXT_BAG_L2 -1.746202 3.292388 293.729514 0.323303 15.990512 2 True 13
14 RandomForestMSE_BAG_L3 -1.748935 6.664425 612.323058 0.741760 36.349049 3 True 25
15 LightGBMXT_BAG_L4 -1.750369 8.707300 846.735910 0.098847 12.148042 4 True 33
16 LightGBMLarge_BAG_L3 -1.752859 6.091848 596.949533 0.169183 20.975524 3 True 31
17 XGBoost_BAG_L4 -1.753387 8.715269 847.496465 0.106816 12.908597 4 True 39
18 LightGBM_BAG_L2 -1.753711 3.071025 291.189458 0.101940 13.450456 2 True 14
19 LightGBMLarge_BAG_L2 -1.754917 3.195825 301.397104 0.226740 23.658102 2 True 21
20 NeuralNetTorch_BAG_L3 -1.755008 6.103540 642.277616 0.180875 66.303606 3 True 30
21 XGBoost_BAG_L2 -1.758962 3.064213 293.935433 0.095128 16.196430 2 True 19
22 ExtraTreesMSE_BAG_L4 -1.759758 9.350366 849.475112 0.741913 14.887244 4 True 37
23 RandomForestMSE_BAG_L4 -1.759897 9.340916 871.133245 0.732463 36.545377 4 True 35
24 NeuralNetTorch_BAG_L4 -1.760546 8.797529 897.553808 0.189076 62.965940 4 True 40
25 WeightedEnsemble_L2 -1.766681 1.102346 227.594674 0.000809 0.630754 2 True 12
26 LightGBMLarge_BAG_L4 -1.769560 8.763754 854.503373 0.155301 19.915505 4 True 41
27 NeuralNetFastAI_BAG_L2 -1.787441 3.413795 347.721117 0.444709 69.982115 2 True 18
28 NeuralNetFastAI_BAG_L3 -1.788667 6.533250 640.165508 0.610585 64.191499 3 True 28
29 LightGBMXT_BAG_L1 -1.792640 0.262771 13.170186 0.262771 13.170186 1 True 3
30 NeuralNetFastAI_BAG_L1 -1.793598 0.407073 71.987040 0.407073 71.987040 1 True 8
31 CatBoost_BAG_L1 -1.796059 0.065646 29.626354 0.065646 29.626354 1 True 6
32 NeuralNetFastAI_BAG_L4 -1.796692 9.032562 898.036599 0.424109 63.448731 4 True 38
33 LightGBM_BAG_L1 -1.806878 0.187209 11.854438 0.187209 11.854438 1 True 4
34 NeuralNetTorch_BAG_L2 -1.816365 3.151669 345.400254 0.182584 67.661252 2 True 20
35 LightGBMLarge_BAG_L1 -1.831872 0.212358 15.512873 0.212358 15.512873 1 True 11
36 NeuralNetTorch_BAG_L1 -1.832035 0.178838 100.325902 0.178838 100.325902 1 True 10
37 XGBoost_BAG_L1 -1.852677 0.143761 11.979749 0.143761 11.979749 1 True 9
38 RandomForestMSE_BAG_L1 -1.971853 0.676349 14.071911 0.676349 14.071911 1 True 5
39 ExtraTreesMSE_BAG_L1 -1.981529 0.623186 9.176623 0.623186 9.176623 1 True 7
40 KNeighborsUnif_BAG_L1 -5.788364 0.103132 0.019244 0.103132 0.019244 1 True 1
41 KNeighborsDist_BAG_L1 -5.839625 0.108763 0.014681 0.108763 0.014681 1 True 2
Number of models trained: 42
Types of models trained:
{'StackerEnsembleModel_TabularNeuralNetTorch', 'StackerEnsembleModel_LGB', 'StackerEnsembleModel_NNFastAiTabular', 'StackerEnsembleModel_KNN', 'StackerEnsembleModel_XT', 'StackerEnsembleModel_RF', 'StackerEnsembleModel_CatBoost', 'WeightedEnsembleModel', 'StackerEnsembleModel_XGBoost'}
Bagging used: True (with 5 folds)
Multi-layer stack-ensembling used: True (with 5 levels)
Feature Metadata (Processed):
(raw dtype, special dtypes):
('float', []) : 2 | ['humidity', 'windspeed_transformed']
('int', ['bool']) : 42 | ['holiday__0', 'holiday__1', 'workingday__0', 'workingday__1', 'weather_clear', ...]
Plot summary of models saved to file: AutogluonModels/ag-20221025_030847/SummaryOfModels.html
*** End of fit() summary ***
{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
'WeightedEnsemble_L2': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFastAiTabular',
'XGBoost_BAG_L2': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L2': 'StackerEnsembleModel_TabularNeuralNetTorch',
'LightGBMLarge_BAG_L2': 'StackerEnsembleModel_LGB',
'WeightedEnsemble_L3': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L3': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L3': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L3': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L3': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L3': 'StackerEnsembleModel_NNFastAiTabular',
'XGBoost_BAG_L3': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L3': 'StackerEnsembleModel_TabularNeuralNetTorch',
'LightGBMLarge_BAG_L3': 'StackerEnsembleModel_LGB',
'WeightedEnsemble_L4': 'WeightedEnsembleModel',
'LightGBMXT_BAG_L4': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L4': 'StackerEnsembleModel_LGB',
'RandomForestMSE_BAG_L4': 'StackerEnsembleModel_RF',
'CatBoost_BAG_L4': 'StackerEnsembleModel_CatBoost',
'ExtraTreesMSE_BAG_L4': 'StackerEnsembleModel_XT',
'NeuralNetFastAI_BAG_L4': 'StackerEnsembleModel_NNFastAiTabular',
'XGBoost_BAG_L4': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L4': 'StackerEnsembleModel_TabularNeuralNetTorch',
'LightGBMLarge_BAG_L4': 'StackerEnsembleModel_LGB',
'WeightedEnsemble_L5': 'WeightedEnsembleModel'},
'model_performance': {'KNeighborsUnif_BAG_L1': -5.788363752090337,
'KNeighborsDist_BAG_L1': -5.839624671702201,
'LightGBMXT_BAG_L1': -1.7926403278299319,
'LightGBM_BAG_L1': -1.8068775600415437,
'RandomForestMSE_BAG_L1': -1.9718527514864965,
'CatBoost_BAG_L1': -1.7960594296431143,
'ExtraTreesMSE_BAG_L1': -1.9815287684514555,
'NeuralNetFastAI_BAG_L1': -1.7935981844138953,
'XGBoost_BAG_L1': -1.8526766165420463,
'NeuralNetTorch_BAG_L1': -1.8320348726680848,
'LightGBMLarge_BAG_L1': -1.8318720539727393,
'WeightedEnsemble_L2': -1.7666811597912742,
'LightGBMXT_BAG_L2': -1.746201770548838,
'LightGBM_BAG_L2': -1.7537111211648773,
'RandomForestMSE_BAG_L2': -1.734991122087423,
'CatBoost_BAG_L2': -1.7448369478159456,
'ExtraTreesMSE_BAG_L2': -1.7160129444388517,
'NeuralNetFastAI_BAG_L2': -1.7874406521145296,
'XGBoost_BAG_L2': -1.758962117749964,
'NeuralNetTorch_BAG_L2': -1.8163651844976538,
'LightGBMLarge_BAG_L2': -1.7549169686761874,
'WeightedEnsemble_L3': -1.7099665686162195,
'LightGBMXT_BAG_L3': -1.7415768246480805,
'LightGBM_BAG_L3': -1.7308603395322157,
'RandomForestMSE_BAG_L3': -1.7489352868569021,
'CatBoost_BAG_L3': -1.726739967562241,
'ExtraTreesMSE_BAG_L3': -1.740520645275114,
'NeuralNetFastAI_BAG_L3': -1.788666818450383,
'XGBoost_BAG_L3': -1.7409272449100142,
'NeuralNetTorch_BAG_L3': -1.7550083334304167,
'LightGBMLarge_BAG_L3': -1.752859086375242,
'WeightedEnsemble_L4': -1.7191431598742484,
'LightGBMXT_BAG_L4': -1.7503686443521544,
'LightGBM_BAG_L4': -1.7426789052544978,
'RandomForestMSE_BAG_L4': -1.7598967684582933,
'CatBoost_BAG_L4': -1.7329033368278717,
'ExtraTreesMSE_BAG_L4': -1.7597577628385301,
'NeuralNetFastAI_BAG_L4': -1.7966921657392099,
'XGBoost_BAG_L4': -1.7533867688411557,
'NeuralNetTorch_BAG_L4': -1.7605460098780552,
'LightGBMLarge_BAG_L4': -1.769559772926085,
'WeightedEnsemble_L5': -1.7293162311472128},
'model_best': 'WeightedEnsemble_L3',
'model_paths': {'KNeighborsUnif_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/KNeighborsUnif_BAG_L1/',
'KNeighborsDist_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/KNeighborsDist_BAG_L1/',
'LightGBMXT_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/LightGBMXT_BAG_L1/',
'LightGBM_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/LightGBM_BAG_L1/',
'RandomForestMSE_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/RandomForestMSE_BAG_L1/',
'CatBoost_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/CatBoost_BAG_L1/',
'ExtraTreesMSE_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/ExtraTreesMSE_BAG_L1/',
'NeuralNetFastAI_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/NeuralNetFastAI_BAG_L1/',
'XGBoost_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/XGBoost_BAG_L1/',
'NeuralNetTorch_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/NeuralNetTorch_BAG_L1/',
'LightGBMLarge_BAG_L1': 'AutogluonModels/ag-20221025_030847/models/LightGBMLarge_BAG_L1/',
'WeightedEnsemble_L2': 'AutogluonModels/ag-20221025_030847/models/WeightedEnsemble_L2/',
'LightGBMXT_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/LightGBMXT_BAG_L2/',
'LightGBM_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/LightGBM_BAG_L2/',
'RandomForestMSE_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/RandomForestMSE_BAG_L2/',
'CatBoost_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/CatBoost_BAG_L2/',
'ExtraTreesMSE_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/ExtraTreesMSE_BAG_L2/',
'NeuralNetFastAI_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/NeuralNetFastAI_BAG_L2/',
'XGBoost_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/XGBoost_BAG_L2/',
'NeuralNetTorch_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/NeuralNetTorch_BAG_L2/',
'LightGBMLarge_BAG_L2': 'AutogluonModels/ag-20221025_030847/models/LightGBMLarge_BAG_L2/',
'WeightedEnsemble_L3': 'AutogluonModels/ag-20221025_030847/models/WeightedEnsemble_L3/',
'LightGBMXT_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/LightGBMXT_BAG_L3/',
'LightGBM_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/LightGBM_BAG_L3/',
'RandomForestMSE_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/RandomForestMSE_BAG_L3/',
'CatBoost_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/CatBoost_BAG_L3/',
'ExtraTreesMSE_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/ExtraTreesMSE_BAG_L3/',
'NeuralNetFastAI_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/NeuralNetFastAI_BAG_L3/',
'XGBoost_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/XGBoost_BAG_L3/',
'NeuralNetTorch_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/NeuralNetTorch_BAG_L3/',
'LightGBMLarge_BAG_L3': 'AutogluonModels/ag-20221025_030847/models/LightGBMLarge_BAG_L3/',
'WeightedEnsemble_L4': 'AutogluonModels/ag-20221025_030847/models/WeightedEnsemble_L4/',
'LightGBMXT_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/LightGBMXT_BAG_L4/',
'LightGBM_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/LightGBM_BAG_L4/',
'RandomForestMSE_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/RandomForestMSE_BAG_L4/',
'CatBoost_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/CatBoost_BAG_L4/',
'ExtraTreesMSE_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/ExtraTreesMSE_BAG_L4/',
'NeuralNetFastAI_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/NeuralNetFastAI_BAG_L4/',
'XGBoost_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/XGBoost_BAG_L4/',
'NeuralNetTorch_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/NeuralNetTorch_BAG_L4/',
'LightGBMLarge_BAG_L4': 'AutogluonModels/ag-20221025_030847/models/LightGBMLarge_BAG_L4/',
'WeightedEnsemble_L5': 'AutogluonModels/ag-20221025_030847/models/WeightedEnsemble_L5/'},
'model_fit_times': {'KNeighborsUnif_BAG_L1': 0.01924443244934082,
'KNeighborsDist_BAG_L1': 0.014680862426757812,
'LightGBMXT_BAG_L1': 13.170186281204224,
'LightGBM_BAG_L1': 11.854438066482544,
'RandomForestMSE_BAG_L1': 14.071911334991455,
'CatBoost_BAG_L1': 29.62635350227356,
'ExtraTreesMSE_BAG_L1': 9.176623106002808,
'NeuralNetFastAI_BAG_L1': 71.98704028129578,
'XGBoost_BAG_L1': 11.979748964309692,
'NeuralNetTorch_BAG_L1': 100.32590222358704,
'LightGBMLarge_BAG_L1': 15.512873411178589,
'WeightedEnsemble_L2': 0.6307535171508789,
'LightGBMXT_BAG_L2': 15.990511655807495,
'LightGBM_BAG_L2': 13.450455904006958,
'RandomForestMSE_BAG_L2': 39.362643003463745,
'CatBoost_BAG_L2': 36.422332763671875,
'ExtraTreesMSE_BAG_L2': 15.511165142059326,
'NeuralNetFastAI_BAG_L2': 69.98211455345154,
'XGBoost_BAG_L2': 16.196430206298828,
'NeuralNetTorch_BAG_L2': 67.66125178337097,
'LightGBMLarge_BAG_L2': 23.658101797103882,
'WeightedEnsemble_L3': 0.5356993675231934,
'LightGBMXT_BAG_L3': 12.165941715240479,
'LightGBM_BAG_L3': 12.447689056396484,
'RandomForestMSE_BAG_L3': 36.34904909133911,
'CatBoost_BAG_L3': 17.88831329345703,
'ExtraTreesMSE_BAG_L3': 14.952313423156738,
'NeuralNetFastAI_BAG_L3': 64.19149851799011,
'XGBoost_BAG_L3': 13.33992338180542,
'NeuralNetTorch_BAG_L3': 66.30360627174377,
'LightGBMLarge_BAG_L3': 20.975523948669434,
'WeightedEnsemble_L4': 0.5452532768249512,
'LightGBMXT_BAG_L4': 12.14804220199585,
'LightGBM_BAG_L4': 12.646603107452393,
'RandomForestMSE_BAG_L4': 36.545376777648926,
'CatBoost_BAG_L4': 16.833138942718506,
'ExtraTreesMSE_BAG_L4': 14.887243747711182,
'NeuralNetFastAI_BAG_L4': 63.44873118400574,
'XGBoost_BAG_L4': 12.908596754074097,
'NeuralNetTorch_BAG_L4': 62.96593976020813,
'LightGBMLarge_BAG_L4': 19.915504693984985,
'WeightedEnsemble_L5': 0.610999584197998},
'model_pred_times': {'KNeighborsUnif_BAG_L1': 0.10313153266906738,
'KNeighborsDist_BAG_L1': 0.10876274108886719,
'LightGBMXT_BAG_L1': 0.2627706527709961,
'LightGBM_BAG_L1': 0.1872093677520752,
'RandomForestMSE_BAG_L1': 0.6763486862182617,
'CatBoost_BAG_L1': 0.06564593315124512,
'ExtraTreesMSE_BAG_L1': 0.6231861114501953,
'NeuralNetFastAI_BAG_L1': 0.4070732593536377,
'XGBoost_BAG_L1': 0.14376091957092285,
'NeuralNetTorch_BAG_L1': 0.17883753776550293,
'LightGBMLarge_BAG_L1': 0.2123584747314453,
'WeightedEnsemble_L2': 0.0008094310760498047,
'LightGBMXT_BAG_L2': 0.32330322265625,
'LightGBM_BAG_L2': 0.10194015502929688,
'RandomForestMSE_BAG_L2': 0.8171391487121582,
'CatBoost_BAG_L2': 0.05943584442138672,
'ExtraTreesMSE_BAG_L2': 0.7026000022888184,
'NeuralNetFastAI_BAG_L2': 0.44470930099487305,
'XGBoost_BAG_L2': 0.09512805938720703,
'NeuralNetTorch_BAG_L2': 0.18258404731750488,
'LightGBMLarge_BAG_L2': 0.22673988342285156,
'WeightedEnsemble_L3': 0.0007486343383789062,
'LightGBMXT_BAG_L3': 0.09041285514831543,
'LightGBM_BAG_L3': 0.08646988868713379,
'RandomForestMSE_BAG_L3': 0.74176025390625,
'CatBoost_BAG_L3': 0.06446218490600586,
'ExtraTreesMSE_BAG_L3': 0.6524994373321533,
'NeuralNetFastAI_BAG_L3': 0.6105854511260986,
'XGBoost_BAG_L3': 0.0895395278930664,
'NeuralNetTorch_BAG_L3': 0.18087530136108398,
'LightGBMLarge_BAG_L3': 0.16918325424194336,
'WeightedEnsemble_L4': 0.0008411407470703125,
'LightGBMXT_BAG_L4': 0.0988473892211914,
'LightGBM_BAG_L4': 0.08568668365478516,
'RandomForestMSE_BAG_L4': 0.7324628829956055,
'CatBoost_BAG_L4': 0.05509328842163086,
'ExtraTreesMSE_BAG_L4': 0.7419130802154541,
'NeuralNetFastAI_BAG_L4': 0.42410850524902344,
'XGBoost_BAG_L4': 0.10681629180908203,
'NeuralNetTorch_BAG_L4': 0.18907594680786133,
'LightGBMLarge_BAG_L4': 0.15530061721801758,
'WeightedEnsemble_L5': 0.001300811767578125},
'num_bag_folds': 5,
'max_stack_level': 5,
'model_hyperparams': {'KNeighborsUnif_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'KNeighborsDist_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'LightGBMXT_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMLarge_BAG_L1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L2': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMLarge_BAG_L2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L3': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMLarge_BAG_L3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L4': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMXT_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'RandomForestMSE_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'CatBoost_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'ExtraTreesMSE_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True,
'use_child_oof': True},
'NeuralNetFastAI_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBMLarge_BAG_L4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L5': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True}},
'leaderboard': model score_val pred_time_val fit_time \
0 WeightedEnsemble_L3 -1.709967 5.543761 479.201571
1 ExtraTreesMSE_BAG_L2 -1.716013 3.671685 293.250168
2 WeightedEnsemble_L4 -1.719143 7.558237 671.496551
3 CatBoost_BAG_L3 -1.726740 5.987127 593.862323
4 WeightedEnsemble_L5 -1.729316 10.520802 991.985767
5 LightGBM_BAG_L3 -1.730860 6.009135 588.421698
6 CatBoost_BAG_L4 -1.732903 8.663546 851.421007
7 RandomForestMSE_BAG_L2 -1.734991 3.786224 317.101645
8 ExtraTreesMSE_BAG_L3 -1.740521 6.575164 590.926323
9 XGBoost_BAG_L3 -1.740927 6.012204 589.313933
10 LightGBMXT_BAG_L3 -1.741577 6.013078 588.139951
11 LightGBM_BAG_L4 -1.742679 8.694140 847.234471
12 CatBoost_BAG_L2 -1.744837 3.028521 314.161335
13 LightGBMXT_BAG_L2 -1.746202 3.292388 293.729514
14 RandomForestMSE_BAG_L3 -1.748935 6.664425 612.323058
15 LightGBMXT_BAG_L4 -1.750369 8.707300 846.735910
16 LightGBMLarge_BAG_L3 -1.752859 6.091848 596.949533
17 XGBoost_BAG_L4 -1.753387 8.715269 847.496465
18 LightGBM_BAG_L2 -1.753711 3.071025 291.189458
19 LightGBMLarge_BAG_L2 -1.754917 3.195825 301.397104
20 NeuralNetTorch_BAG_L3 -1.755008 6.103540 642.277616
21 XGBoost_BAG_L2 -1.758962 3.064213 293.935433
22 ExtraTreesMSE_BAG_L4 -1.759758 9.350366 849.475112
23 RandomForestMSE_BAG_L4 -1.759897 9.340916 871.133245
24 NeuralNetTorch_BAG_L4 -1.760546 8.797529 897.553808
25 WeightedEnsemble_L2 -1.766681 1.102346 227.594674
26 LightGBMLarge_BAG_L4 -1.769560 8.763754 854.503373
27 NeuralNetFastAI_BAG_L2 -1.787441 3.413795 347.721117
28 NeuralNetFastAI_BAG_L3 -1.788667 6.533250 640.165508
29 LightGBMXT_BAG_L1 -1.792640 0.262771 13.170186
30 NeuralNetFastAI_BAG_L1 -1.793598 0.407073 71.987040
31 CatBoost_BAG_L1 -1.796059 0.065646 29.626354
32 NeuralNetFastAI_BAG_L4 -1.796692 9.032562 898.036599
33 LightGBM_BAG_L1 -1.806878 0.187209 11.854438
34 NeuralNetTorch_BAG_L2 -1.816365 3.151669 345.400254
35 LightGBMLarge_BAG_L1 -1.831872 0.212358 15.512873
36 NeuralNetTorch_BAG_L1 -1.832035 0.178838 100.325902
37 XGBoost_BAG_L1 -1.852677 0.143761 11.979749
38 RandomForestMSE_BAG_L1 -1.971853 0.676349 14.071911
39 ExtraTreesMSE_BAG_L1 -1.981529 0.623186 9.176623
40 KNeighborsUnif_BAG_L1 -5.788364 0.103132 0.019244
41 KNeighborsDist_BAG_L1 -5.839625 0.108763 0.014681
pred_time_val_marginal fit_time_marginal stack_level can_infer \
0 0.000749 0.535699 3 True
1 0.702600 15.511165 2 True
2 0.000841 0.545253 4 True
3 0.064462 17.888313 3 True
4 0.001301 0.611000 5 True
5 0.086470 12.447689 3 True
6 0.055093 16.833139 4 True
7 0.817139 39.362643 2 True
8 0.652499 14.952313 3 True
9 0.089540 13.339923 3 True
10 0.090413 12.165942 3 True
11 0.085687 12.646603 4 True
12 0.059436 36.422333 2 True
13 0.323303 15.990512 2 True
14 0.741760 36.349049 3 True
15 0.098847 12.148042 4 True
16 0.169183 20.975524 3 True
17 0.106816 12.908597 4 True
18 0.101940 13.450456 2 True
19 0.226740 23.658102 2 True
20 0.180875 66.303606 3 True
21 0.095128 16.196430 2 True
22 0.741913 14.887244 4 True
23 0.732463 36.545377 4 True
24 0.189076 62.965940 4 True
25 0.000809 0.630754 2 True
26 0.155301 19.915505 4 True
27 0.444709 69.982115 2 True
28 0.610585 64.191499 3 True
29 0.262771 13.170186 1 True
30 0.407073 71.987040 1 True
31 0.065646 29.626354 1 True
32 0.424109 63.448731 4 True
33 0.187209 11.854438 1 True
34 0.182584 67.661252 2 True
35 0.212358 15.512873 1 True
36 0.178838 100.325902 1 True
37 0.143761 11.979749 1 True
38 0.676349 14.071911 1 True
39 0.623186 9.176623 1 True
40 0.103132 0.019244 1 True
41 0.108763 0.014681 1 True
fit_order
0 22
1 17
2 32
3 26
4 42
5 24
6 36
7 15
8 27
9 29
10 23
11 34
12 16
13 13
14 25
15 33
16 31
17 39
18 14
19 21
20 30
21 19
22 37
23 35
24 40
25 12
26 41
27 18
28 28
29 3
30 8
31 6
32 38
33 4
34 20
35 11
36 10
37 9
38 5
39 7
40 1
41 2 }
predictor.leaderboard()
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order 0 WeightedEnsemble_L3 -1.709967 5.543761 479.201571 0.000749 0.535699 3 True 22 1 ExtraTreesMSE_BAG_L2 -1.716013 3.671685 293.250168 0.702600 15.511165 2 True 17 2 WeightedEnsemble_L4 -1.719143 7.558237 671.496551 0.000841 0.545253 4 True 32 3 CatBoost_BAG_L3 -1.726740 5.987127 593.862323 0.064462 17.888313 3 True 26 4 WeightedEnsemble_L5 -1.729316 10.520802 991.985767 0.001301 0.611000 5 True 42 5 LightGBM_BAG_L3 -1.730860 6.009135 588.421698 0.086470 12.447689 3 True 24 6 CatBoost_BAG_L4 -1.732903 8.663546 851.421007 0.055093 16.833139 4 True 36 7 RandomForestMSE_BAG_L2 -1.734991 3.786224 317.101645 0.817139 39.362643 2 True 15 8 ExtraTreesMSE_BAG_L3 -1.740521 6.575164 590.926323 0.652499 14.952313 3 True 27 9 XGBoost_BAG_L3 -1.740927 6.012204 589.313933 0.089540 13.339923 3 True 29 10 LightGBMXT_BAG_L3 -1.741577 6.013078 588.139951 0.090413 12.165942 3 True 23 11 LightGBM_BAG_L4 -1.742679 8.694140 847.234471 0.085687 12.646603 4 True 34 12 CatBoost_BAG_L2 -1.744837 3.028521 314.161335 0.059436 36.422333 2 True 16 13 LightGBMXT_BAG_L2 -1.746202 3.292388 293.729514 0.323303 15.990512 2 True 13 14 RandomForestMSE_BAG_L3 -1.748935 6.664425 612.323058 0.741760 36.349049 3 True 25 15 LightGBMXT_BAG_L4 -1.750369 8.707300 846.735910 0.098847 12.148042 4 True 33 16 LightGBMLarge_BAG_L3 -1.752859 6.091848 596.949533 0.169183 20.975524 3 True 31 17 XGBoost_BAG_L4 -1.753387 8.715269 847.496465 0.106816 12.908597 4 True 39 18 LightGBM_BAG_L2 -1.753711 3.071025 291.189458 0.101940 13.450456 2 True 14 19 LightGBMLarge_BAG_L2 -1.754917 3.195825 301.397104 0.226740 23.658102 2 True 21 20 NeuralNetTorch_BAG_L3 -1.755008 6.103540 642.277616 0.180875 66.303606 3 True 30 21 XGBoost_BAG_L2 -1.758962 3.064213 293.935433 0.095128 16.196430 2 True 19 22 ExtraTreesMSE_BAG_L4 -1.759758 9.350366 849.475112 0.741913 14.887244 4 True 37 23 RandomForestMSE_BAG_L4 -1.759897 9.340916 871.133245 0.732463 36.545377 4 True 35 24 NeuralNetTorch_BAG_L4 -1.760546 8.797529 897.553808 0.189076 62.965940 4 True 40 25 WeightedEnsemble_L2 -1.766681 1.102346 227.594674 0.000809 0.630754 2 True 12 26 LightGBMLarge_BAG_L4 -1.769560 8.763754 854.503373 0.155301 19.915505 4 True 41 27 NeuralNetFastAI_BAG_L2 -1.787441 3.413795 347.721117 0.444709 69.982115 2 True 18 28 NeuralNetFastAI_BAG_L3 -1.788667 6.533250 640.165508 0.610585 64.191499 3 True 28 29 LightGBMXT_BAG_L1 -1.792640 0.262771 13.170186 0.262771 13.170186 1 True 3 30 NeuralNetFastAI_BAG_L1 -1.793598 0.407073 71.987040 0.407073 71.987040 1 True 8 31 CatBoost_BAG_L1 -1.796059 0.065646 29.626354 0.065646 29.626354 1 True 6 32 NeuralNetFastAI_BAG_L4 -1.796692 9.032562 898.036599 0.424109 63.448731 4 True 38 33 LightGBM_BAG_L1 -1.806878 0.187209 11.854438 0.187209 11.854438 1 True 4 34 NeuralNetTorch_BAG_L2 -1.816365 3.151669 345.400254 0.182584 67.661252 2 True 20 35 LightGBMLarge_BAG_L1 -1.831872 0.212358 15.512873 0.212358 15.512873 1 True 11 36 NeuralNetTorch_BAG_L1 -1.832035 0.178838 100.325902 0.178838 100.325902 1 True 10 37 XGBoost_BAG_L1 -1.852677 0.143761 11.979749 0.143761 11.979749 1 True 9 38 RandomForestMSE_BAG_L1 -1.971853 0.676349 14.071911 0.676349 14.071911 1 True 5 39 ExtraTreesMSE_BAG_L1 -1.981529 0.623186 9.176623 0.623186 9.176623 1 True 7 40 KNeighborsUnif_BAG_L1 -5.788364 0.103132 0.019244 0.103132 0.019244 1 True 1 41 KNeighborsDist_BAG_L1 -5.839625 0.108763 0.014681 0.108763 0.014681 1 True 2
| model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | WeightedEnsemble_L3 | -1.709967 | 5.543761 | 479.201571 | 0.000749 | 0.535699 | 3 | True | 22 |
| 1 | ExtraTreesMSE_BAG_L2 | -1.716013 | 3.671685 | 293.250168 | 0.702600 | 15.511165 | 2 | True | 17 |
| 2 | WeightedEnsemble_L4 | -1.719143 | 7.558237 | 671.496551 | 0.000841 | 0.545253 | 4 | True | 32 |
| 3 | CatBoost_BAG_L3 | -1.726740 | 5.987127 | 593.862323 | 0.064462 | 17.888313 | 3 | True | 26 |
| 4 | WeightedEnsemble_L5 | -1.729316 | 10.520802 | 991.985767 | 0.001301 | 0.611000 | 5 | True | 42 |
| 5 | LightGBM_BAG_L3 | -1.730860 | 6.009135 | 588.421698 | 0.086470 | 12.447689 | 3 | True | 24 |
| 6 | CatBoost_BAG_L4 | -1.732903 | 8.663546 | 851.421007 | 0.055093 | 16.833139 | 4 | True | 36 |
| 7 | RandomForestMSE_BAG_L2 | -1.734991 | 3.786224 | 317.101645 | 0.817139 | 39.362643 | 2 | True | 15 |
| 8 | ExtraTreesMSE_BAG_L3 | -1.740521 | 6.575164 | 590.926323 | 0.652499 | 14.952313 | 3 | True | 27 |
| 9 | XGBoost_BAG_L3 | -1.740927 | 6.012204 | 589.313933 | 0.089540 | 13.339923 | 3 | True | 29 |
| 10 | LightGBMXT_BAG_L3 | -1.741577 | 6.013078 | 588.139951 | 0.090413 | 12.165942 | 3 | True | 23 |
| 11 | LightGBM_BAG_L4 | -1.742679 | 8.694140 | 847.234471 | 0.085687 | 12.646603 | 4 | True | 34 |
| 12 | CatBoost_BAG_L2 | -1.744837 | 3.028521 | 314.161335 | 0.059436 | 36.422333 | 2 | True | 16 |
| 13 | LightGBMXT_BAG_L2 | -1.746202 | 3.292388 | 293.729514 | 0.323303 | 15.990512 | 2 | True | 13 |
| 14 | RandomForestMSE_BAG_L3 | -1.748935 | 6.664425 | 612.323058 | 0.741760 | 36.349049 | 3 | True | 25 |
| 15 | LightGBMXT_BAG_L4 | -1.750369 | 8.707300 | 846.735910 | 0.098847 | 12.148042 | 4 | True | 33 |
| 16 | LightGBMLarge_BAG_L3 | -1.752859 | 6.091848 | 596.949533 | 0.169183 | 20.975524 | 3 | True | 31 |
| 17 | XGBoost_BAG_L4 | -1.753387 | 8.715269 | 847.496465 | 0.106816 | 12.908597 | 4 | True | 39 |
| 18 | LightGBM_BAG_L2 | -1.753711 | 3.071025 | 291.189458 | 0.101940 | 13.450456 | 2 | True | 14 |
| 19 | LightGBMLarge_BAG_L2 | -1.754917 | 3.195825 | 301.397104 | 0.226740 | 23.658102 | 2 | True | 21 |
| 20 | NeuralNetTorch_BAG_L3 | -1.755008 | 6.103540 | 642.277616 | 0.180875 | 66.303606 | 3 | True | 30 |
| 21 | XGBoost_BAG_L2 | -1.758962 | 3.064213 | 293.935433 | 0.095128 | 16.196430 | 2 | True | 19 |
| 22 | ExtraTreesMSE_BAG_L4 | -1.759758 | 9.350366 | 849.475112 | 0.741913 | 14.887244 | 4 | True | 37 |
| 23 | RandomForestMSE_BAG_L4 | -1.759897 | 9.340916 | 871.133245 | 0.732463 | 36.545377 | 4 | True | 35 |
| 24 | NeuralNetTorch_BAG_L4 | -1.760546 | 8.797529 | 897.553808 | 0.189076 | 62.965940 | 4 | True | 40 |
| 25 | WeightedEnsemble_L2 | -1.766681 | 1.102346 | 227.594674 | 0.000809 | 0.630754 | 2 | True | 12 |
| 26 | LightGBMLarge_BAG_L4 | -1.769560 | 8.763754 | 854.503373 | 0.155301 | 19.915505 | 4 | True | 41 |
| 27 | NeuralNetFastAI_BAG_L2 | -1.787441 | 3.413795 | 347.721117 | 0.444709 | 69.982115 | 2 | True | 18 |
| 28 | NeuralNetFastAI_BAG_L3 | -1.788667 | 6.533250 | 640.165508 | 0.610585 | 64.191499 | 3 | True | 28 |
| 29 | LightGBMXT_BAG_L1 | -1.792640 | 0.262771 | 13.170186 | 0.262771 | 13.170186 | 1 | True | 3 |
| 30 | NeuralNetFastAI_BAG_L1 | -1.793598 | 0.407073 | 71.987040 | 0.407073 | 71.987040 | 1 | True | 8 |
| 31 | CatBoost_BAG_L1 | -1.796059 | 0.065646 | 29.626354 | 0.065646 | 29.626354 | 1 | True | 6 |
| 32 | NeuralNetFastAI_BAG_L4 | -1.796692 | 9.032562 | 898.036599 | 0.424109 | 63.448731 | 4 | True | 38 |
| 33 | LightGBM_BAG_L1 | -1.806878 | 0.187209 | 11.854438 | 0.187209 | 11.854438 | 1 | True | 4 |
| 34 | NeuralNetTorch_BAG_L2 | -1.816365 | 3.151669 | 345.400254 | 0.182584 | 67.661252 | 2 | True | 20 |
| 35 | LightGBMLarge_BAG_L1 | -1.831872 | 0.212358 | 15.512873 | 0.212358 | 15.512873 | 1 | True | 11 |
| 36 | NeuralNetTorch_BAG_L1 | -1.832035 | 0.178838 | 100.325902 | 0.178838 | 100.325902 | 1 | True | 10 |
| 37 | XGBoost_BAG_L1 | -1.852677 | 0.143761 | 11.979749 | 0.143761 | 11.979749 | 1 | True | 9 |
| 38 | RandomForestMSE_BAG_L1 | -1.971853 | 0.676349 | 14.071911 | 0.676349 | 14.071911 | 1 | True | 5 |
| 39 | ExtraTreesMSE_BAG_L1 | -1.981529 | 0.623186 | 9.176623 | 0.623186 | 9.176623 | 1 | True | 7 |
| 40 | KNeighborsUnif_BAG_L1 | -5.788364 | 0.103132 | 0.019244 | 0.103132 | 0.019244 | 1 | True | 1 |
| 41 | KNeighborsDist_BAG_L1 | -5.839625 | 0.108763 | 0.014681 | 0.108763 | 0.014681 | 1 | True | 2 |
predictor.evaluate(preprocessed_train_data)
Evaluation: root_mean_squared_error on test data: -1.678828251114754
Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
"root_mean_squared_error": -1.678828251114754,
"mean_squared_error": -2.818464296741024,
"mean_absolute_error": -1.2709445158856278,
"r2": 0.9142395487039373,
"pearsonr": 0.956160230472608,
"median_absolute_error": -0.966761520482426
}
{'root_mean_squared_error': -1.678828251114754,
'mean_squared_error': -2.818464296741024,
'mean_absolute_error': -1.2709445158856278,
'r2': 0.9142395487039373,
'pearsonr': 0.956160230472608,
'median_absolute_error': -0.966761520482426}
train_preds = predictor.predict(preprocessed_train_data.iloc[:,:-1])
train_preds = inv_boxcox(train_preds, 0.32)
train_preds_log = np.log(train_preds+1)
train_counts_log = np.log(preprocessed_train_data.count_transformed+1)
train_errors = (np.sqrt(abs(train_counts_log - train_preds_log))).mean()
print(f"training_RMSLE: {train_errors}")
training_RMSLE: 1.4637948842502102
predictions = predictor.predict(preprocessed_test_data)
predictions.head()
0 4.059525 1 0.788865 2 0.788865 3 0.834664 4 0.834664 Name: count_transformed, dtype: float32
# Describe the `predictions` series to see if there are any negative values
predictions.describe()
count 6493.000000 mean 11.588725 std 5.427245 min 0.606701 25% 7.345763 50% 12.132465 75% 15.781278 max 22.973425 Name: count_transformed, dtype: float64
count values¶predictions_final = inv_boxcox(predictions, 0.32)
predictions_final.describe()
count 6493.000000 mean 183.061798 std 166.037354 min 1.741015 25% 43.754826 50% 141.897964 75% 277.324066 max 759.474854 Name: count_transformed, dtype: float64
# How many negative values do we have?
predictions_final[predictions_final<0]
Series([], Name: count_transformed, dtype: float32)
submission["count"] = predictions_final
submission.to_csv("submission.csv", index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "FeatEng with wind,count transformations no hyperopt"
100%|█████████████████████████████████████████| 188k/188k [00:00<00:00, 409kB/s] Successfully submitted to Bike Sharing Demand
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 10
fileName date description status publicScore privateScore -------------- ------------------- --------------------------------------------------- -------- ----------- ------------ submission.csv 2022-10-25 03:33:32 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-25 02:52:44 first raw submission complete 1.76328 1.76328 submission.csv 2022-10-25 02:20:30 first raw submission complete 1.79318 1.79318 submission.csv 2022-10-25 01:08:27 new feats hpo log count complete 0.51762 0.51762 submission.csv 2022-10-25 01:07:45 new feats hpo log count complete 0.83696 0.83696 submission.csv 2022-10-25 00:43:35 new feats hpo rf complete 0.83696 0.83696 submission.csv 2022-10-25 00:21:57 new feats hpo gbrt complete 0.75869 0.75869 submission.csv 2022-10-25 00:18:26 new feats hpo gbrt error tail: write error: Broken pipe
0.529¶hyperparameter and hyperparameter_tune_kwargs arguments.# import different models from autogluon
from autogluon.tabular.models import LGBModel, CatBoostModel, XGBoostModel, RFModel, XTModel, TabularNeuralNetTorchModel, TabularNeuralNetMxnetModel, NNFastAiTabularModel
import autogluon.core as ag
lgbm = LGBModel()
catbm = CatBoostModel()
xgb = XGBoostModel()
rf = RFModel()
xt = XTModel()
nn_torch = TabularNeuralNetTorchModel()
nn_mxnet = TabularNeuralNetMxnetModel()
nn_fastai = NNFastAiTabularModel()
Warning: No name was specified for model, defaulting to class name: LGBModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/LGBModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: CatBoostModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/CatBoostModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: XGBoostModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/XGBoostModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: RFModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/RFModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: XTModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/XTModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: TabularNeuralNetTorchModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/TabularNeuralNetTorchModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: TabularNeuralNetMxnetModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/TabularNeuralNetMxnetModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/ Warning: No name was specified for model, defaulting to class name: NNFastAiTabularModel No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033340/NNFastAiTabularModel/" Warning: No path was specified for model, defaulting to: AutogluonModels/ag-20221025_033340/
nn_options = { # specifies non-default hyperparameter values for neural network models
'num_epochs': 50, # number of training epochs (controls training time of NN models)
'learning_rate': ag.space.Real(3e-4, 1e-2, log=True), # learning rate used in training (real-valued hyperparameter searched on log-scale)
'activation': ag.space.Categorical('relu', 'tanh'), # activation function used in NN (categorical hyperparameter, default = first entry)
'dropout_prob': ag.space.Real(0.0, 0.5, default=0.1), # dropout probability (real-valued hyperparameter)
}
gbm_options = { # specifies non-default hyperparameter values for lightGBM gradient boosted trees
'learning_rate': ag.Real(3e-4, 0.3, log=True),
'num_boost_round': 1000, # number of boosting rounds (controls training time of GBM models)
'num_leaves': ag.space.Int(lower=26, upper=66, default=36), # number of leaves in trees (integer hyperparameter)
'max_depth': ag.space.Int(lower=3, upper=10, default=6)
}
xgbm_options = { # specifies non-default hyperparameter values for XGBoost gradient boosted trees
'n_estimators': 1000,
'learning_rate': ag.space.Real(1e-4, 3e-1, default=2e-2, log=True),
'colsample_bytree': ag.space.Real(1e-4, 1, default=5e-2, log=True),
'lambda': ag.space.Real(1e-4, 1, default=5e-2, log=True)
}
## Keys for the model types
# 'GBM': Light GBM,
# 'CAT': CatBoost,
# 'XGB': XGBoost,
# 'RF': Random Forest,
# 'XT': Extra Trees,
# 'NN_TORCH': PyTorch Neural Net,
# 'NN_MXNET': MXNet Neural Net,
# 'FASTAI': FastAI Neural Net
hyperparameters = { # hyperparameters of each model type
'NN_TORCH': nn_options,
'GBM': gbm_options,
'XGB': xgbm_options,
}
# Custom stack ensembling
# hyperparameters = {
# 1: {'GBM': gbm_options},
# 2: {'XGB': [xgbm_options], 'NN_TORCH': nn_options},
# 3: {"RF": {}}
# }
hyperparameter_tune_kwargs = { # HPO is not performed unless hyperparameter_tune_kwargs is specified
'num_trials': 5,
'scheduler' : "local",
'searcher': "auto"
}
predictor = TabularPredictor(label="count_transformed", eval_metric="root_mean_squared_error").fit(preprocessed_train_data,
time_limit=1200,
presets="best_quality",
auto_stack=True,
num_bag_folds=5,
num_bag_sets=2,
num_stack_levels=3,
hyperparameters=hyperparameters,
hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
verbosity=0
)
No path specified. Models will be saved in: "AutogluonModels/ag-20221025_033348/" 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 4.74657
60%|██████ | 3/5 [00:05<00:03, 1.75s/it]
[1000] valid_set's rmse: 2.89128
80%|████████ | 4/5 [00:06<00:01, 1.58s/it]
[1000] valid_set's rmse: 1.7995
100%|██████████| 5/5 [00:08<00:00, 1.75s/it] 100%|██████████| 5/5 [00:06<00:00, 1.34s/it] 2022-10-25 03:34:09,242 ERROR syncer.py:147 -- Log sync requires rsync to be installed. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 4.4008
60%|██████ | 3/5 [00:06<00:04, 2.00s/it]
[1000] valid_set's rmse: 1.9525
100%|██████████| 5/5 [00:11<00:00, 2.25s/it] 100%|██████████| 5/5 [00:11<00:00, 2.22s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 4.41347
60%|██████ | 3/5 [00:06<00:03, 2.00s/it]
[1000] valid_set's rmse: 1.97423
100%|██████████| 5/5 [00:10<00:00, 2.15s/it] 100%|██████████| 5/5 [00:09<00:00, 1.97s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 4.45331
0%| | 0/5 [00:03<?, ?it/s] 40%|████ | 2/5 [00:06<00:10, 3.39s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor.
predictor.fit_summary()
*** Summary of fit() ***
Estimated performance of each model:
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L3 -1.778244 7.022648 548.568486 0.001141 0.810898 3 True 26
1 XGBoost_BAG_L2/T3 -1.783042 6.390866 360.259035 0.285622 15.316813 2 True 21
2 LightGBM_BAG_L2/T3 -1.784718 6.249691 356.277861 0.144447 11.335639 2 True 16
3 WeightedEnsemble_L2 -1.784973 3.070834 264.599079 0.000950 0.698275 2 True 13
4 WeightedEnsemble_L4 -1.786558 11.019505 821.598609 0.001134 1.085356 4 True 39
5 XGBoost_BAG_L3/T3 -1.787126 10.458254 664.064200 0.228621 14.184485 3 True 34
6 LightGBM_BAG_L2/T5 -1.787787 6.412117 358.831490 0.306873 13.889267 2 True 18
7 LightGBM_BAG_L2/T2 -1.793227 6.200765 355.310230 0.095521 10.368008 2 True 15
8 WeightedEnsemble_L5 -1.794568 14.475055 1001.419703 0.000846 0.349421 5 True 45
9 XGBoost_BAG_L4/T3 -1.794605 14.230051 940.808287 0.310236 14.956454 4 True 43
10 LightGBM_BAG_L3/T3 -1.799348 10.373318 660.922389 0.143685 11.042674 3 True 29
11 LightGBM_BAG_L3/T5 -1.801689 10.454503 662.610704 0.224869 12.730988 3 True 31
12 LightGBM_BAG_L3/T2 -1.806713 10.328997 661.161135 0.099364 11.281420 3 True 28
13 LightGBM_BAG_L1/T5 -1.807358 1.319751 13.706730 1.319751 13.706730 1 True 5
14 LightGBM_BAG_L1/T3 -1.810727 1.068052 13.005809 1.068052 13.005809 1 True 3
15 XGBoost_BAG_L4/T1 -1.824299 14.334190 941.531194 0.414375 15.679361 4 True 41
16 XGBoost_BAG_L3/T1 -1.826169 10.585282 663.552703 0.355648 13.672988 3 True 32
17 NeuralNetTorch_BAG_L2/T2 -1.829284 6.285274 427.292903 0.180029 82.350681 2 True 25
18 NeuralNetTorch_BAG_L3/T2 -1.839217 10.394838 698.376677 0.165205 48.496962 3 True 38
19 NeuralNetTorch_BAG_L1/T2 -1.842914 0.172610 72.621161 0.172610 72.621161 1 True 12
20 NeuralNetTorch_BAG_L4/T1 -1.843181 14.163973 986.113829 0.244158 60.261995 4 True 44
21 LightGBM_BAG_L1/T2 -1.843772 0.364867 11.960453 0.364867 11.960453 1 True 2
22 NeuralNetTorch_BAG_L3/T1 -1.848099 10.525182 746.550387 0.295548 96.670671 3 True 37
23 XGBoost_BAG_L3/T4 -1.852206 10.393421 660.752949 0.163788 10.873233 3 True 35
24 NeuralNetTorch_BAG_L2/T1 -1.853336 6.315888 428.386448 0.210644 83.444226 2 True 24
25 XGBoost_BAG_L2/T1 -1.893179 6.490866 358.700746 0.385622 13.758523 2 True 19
26 XGBoost_BAG_L3/T5 -1.901856 10.346119 659.414380 0.116486 9.534665 3 True 36
27 XGBoost_BAG_L2/T4 -1.904067 6.278883 356.130288 0.173638 11.188066 2 True 22
28 LightGBM_BAG_L3/T4 -1.940244 10.946850 664.512906 0.717217 14.633191 3 True 30
29 LightGBM_BAG_L2/T4 -1.941501 6.958646 361.559075 0.853402 16.616853 2 True 17
30 XGBoost_BAG_L2/T5 -1.944034 6.301479 357.598343 0.196235 12.656121 2 True 23
31 NeuralNetTorch_BAG_L1/T1 -1.987288 0.509470 164.567105 0.509470 164.567105 1 True 11
32 XGBoost_BAG_L4/T2 -2.029927 14.078310 936.891913 0.158494 11.040080 4 True 42
33 XGBoost_BAG_L3/T2 -2.040494 10.383188 661.098322 0.153555 11.218607 3 True 33
34 XGBoost_BAG_L1/T3 -2.192204 0.197436 9.999094 0.197436 9.999094 1 True 8
35 XGBoost_BAG_L2/T2 -2.325588 6.298459 355.898842 0.193214 10.956619 2 True 20
36 XGBoost_BAG_L1/T5 -2.757694 0.105013 8.123122 0.105013 8.123122 1 True 10
37 XGBoost_BAG_L1/T4 -2.817106 0.097201 8.699144 0.097201 8.699144 1 True 9
38 LightGBM_BAG_L1/T4 -2.886574 0.855958 11.389135 0.855958 11.389135 1 True 4
39 XGBoost_BAG_L1/T1 -3.113564 0.179699 8.402470 0.179699 8.402470 1 True 6
40 LightGBM_BAG_L3/T1 -4.412642 11.255830 671.511950 1.026196 21.632235 3 True 27
41 LightGBM_BAG_L4/T1 -4.414217 14.965228 947.887919 1.045413 22.036085 4 True 40
42 LightGBM_BAG_L2/T1 -4.414685 7.204387 367.998902 1.099143 23.056679 2 True 14
43 LightGBM_BAG_L1/T1 -4.710417 1.101248 13.972327 1.101248 13.972327 1 True 1
44 XGBoost_BAG_L1/T2 -4.751995 0.133938 8.495673 0.133938 8.495673 1 True 7
Number of models trained: 45
Types of models trained:
{'WeightedEnsembleModel', 'StackerEnsembleModel_XGBoost', 'StackerEnsembleModel_LGB', 'StackerEnsembleModel_TabularNeuralNetTorch'}
Bagging used: True (with 5 folds)
Multi-layer stack-ensembling used: True (with 5 levels)
Feature Metadata (Processed):
(raw dtype, special dtypes):
('float', []) : 2 | ['humidity', 'windspeed_transformed']
('int', ['bool']) : 42 | ['holiday__0', 'holiday__1', 'workingday__0', 'workingday__1', 'weather_clear', ...]
Plot summary of models saved to file: AutogluonModels/ag-20221025_033348/SummaryOfModels.html
*** End of fit() summary ***
{'model_types': {'LightGBM_BAG_L1/T1': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1/T2': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1/T3': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1/T4': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L1/T5': 'StackerEnsembleModel_LGB',
'XGBoost_BAG_L1/T1': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L1/T2': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L1/T3': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L1/T4': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L1/T5': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L1/T1': 'StackerEnsembleModel_TabularNeuralNetTorch',
'NeuralNetTorch_BAG_L1/T2': 'StackerEnsembleModel_TabularNeuralNetTorch',
'WeightedEnsemble_L2': 'WeightedEnsembleModel',
'LightGBM_BAG_L2/T1': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2/T2': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2/T3': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2/T4': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L2/T5': 'StackerEnsembleModel_LGB',
'XGBoost_BAG_L2/T1': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L2/T2': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L2/T3': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L2/T4': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L2/T5': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L2/T1': 'StackerEnsembleModel_TabularNeuralNetTorch',
'NeuralNetTorch_BAG_L2/T2': 'StackerEnsembleModel_TabularNeuralNetTorch',
'WeightedEnsemble_L3': 'WeightedEnsembleModel',
'LightGBM_BAG_L3/T1': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3/T2': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3/T3': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3/T4': 'StackerEnsembleModel_LGB',
'LightGBM_BAG_L3/T5': 'StackerEnsembleModel_LGB',
'XGBoost_BAG_L3/T1': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L3/T2': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L3/T3': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L3/T4': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L3/T5': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L3/T1': 'StackerEnsembleModel_TabularNeuralNetTorch',
'NeuralNetTorch_BAG_L3/T2': 'StackerEnsembleModel_TabularNeuralNetTorch',
'WeightedEnsemble_L4': 'WeightedEnsembleModel',
'LightGBM_BAG_L4/T1': 'StackerEnsembleModel_LGB',
'XGBoost_BAG_L4/T1': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L4/T2': 'StackerEnsembleModel_XGBoost',
'XGBoost_BAG_L4/T3': 'StackerEnsembleModel_XGBoost',
'NeuralNetTorch_BAG_L4/T1': 'StackerEnsembleModel_TabularNeuralNetTorch',
'WeightedEnsemble_L5': 'WeightedEnsembleModel'},
'model_performance': {'LightGBM_BAG_L1/T1': -4.710417164661144,
'LightGBM_BAG_L1/T2': -1.8437719704634155,
'LightGBM_BAG_L1/T3': -1.8107274838375766,
'LightGBM_BAG_L1/T4': -2.8865740161424616,
'LightGBM_BAG_L1/T5': -1.807357550701218,
'XGBoost_BAG_L1/T1': -3.113564155131339,
'XGBoost_BAG_L1/T2': -4.7519947083833785,
'XGBoost_BAG_L1/T3': -2.192204444375988,
'XGBoost_BAG_L1/T4': -2.8171058038801484,
'XGBoost_BAG_L1/T5': -2.757693515695107,
'NeuralNetTorch_BAG_L1/T1': -1.987288292704792,
'NeuralNetTorch_BAG_L1/T2': -1.8429143036558109,
'WeightedEnsemble_L2': -1.7849729834331916,
'LightGBM_BAG_L2/T1': -4.414684902554683,
'LightGBM_BAG_L2/T2': -1.793226531706533,
'LightGBM_BAG_L2/T3': -1.784718470735153,
'LightGBM_BAG_L2/T4': -1.941500558605507,
'LightGBM_BAG_L2/T5': -1.7877874404115763,
'XGBoost_BAG_L2/T1': -1.8931791022497226,
'XGBoost_BAG_L2/T2': -2.3255876450543047,
'XGBoost_BAG_L2/T3': -1.7830418362551668,
'XGBoost_BAG_L2/T4': -1.904066914021902,
'XGBoost_BAG_L2/T5': -1.944033714260512,
'NeuralNetTorch_BAG_L2/T1': -1.8533362826531212,
'NeuralNetTorch_BAG_L2/T2': -1.8292835839108086,
'WeightedEnsemble_L3': -1.7782444509454844,
'LightGBM_BAG_L3/T1': -4.412642144529005,
'LightGBM_BAG_L3/T2': -1.8067130919405237,
'LightGBM_BAG_L3/T3': -1.7993482512247272,
'LightGBM_BAG_L3/T4': -1.9402443102339848,
'LightGBM_BAG_L3/T5': -1.8016894675846666,
'XGBoost_BAG_L3/T1': -1.826169086380875,
'XGBoost_BAG_L3/T2': -2.0404942036562366,
'XGBoost_BAG_L3/T3': -1.7871258812500348,
'XGBoost_BAG_L3/T4': -1.8522060254467485,
'XGBoost_BAG_L3/T5': -1.901855913183338,
'NeuralNetTorch_BAG_L3/T1': -1.8480991018677475,
'NeuralNetTorch_BAG_L3/T2': -1.8392173238240703,
'WeightedEnsemble_L4': -1.7865582026026012,
'LightGBM_BAG_L4/T1': -4.414217497654231,
'XGBoost_BAG_L4/T1': -1.8242987471818275,
'XGBoost_BAG_L4/T2': -2.0299272060419877,
'XGBoost_BAG_L4/T3': -1.7946051997997423,
'NeuralNetTorch_BAG_L4/T1': -1.8431810582691068,
'WeightedEnsemble_L5': -1.7945684084456983},
'model_best': 'WeightedEnsemble_L3',
'model_paths': {'LightGBM_BAG_L1/T1': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L1/T1/',
'LightGBM_BAG_L1/T2': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L1/T2/',
'LightGBM_BAG_L1/T3': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L1/T3/',
'LightGBM_BAG_L1/T4': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L1/T4/',
'LightGBM_BAG_L1/T5': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L1/T5/',
'XGBoost_BAG_L1/T1': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L1/T1/',
'XGBoost_BAG_L1/T2': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L1/T2/',
'XGBoost_BAG_L1/T3': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L1/T3/',
'XGBoost_BAG_L1/T4': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L1/T4/',
'XGBoost_BAG_L1/T5': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L1/T5/',
'NeuralNetTorch_BAG_L1/T1': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L1/T1/',
'NeuralNetTorch_BAG_L1/T2': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L1/T2/',
'WeightedEnsemble_L2': 'AutogluonModels/ag-20221025_033348/models/WeightedEnsemble_L2/',
'LightGBM_BAG_L2/T1': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L2/T1/',
'LightGBM_BAG_L2/T2': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L2/T2/',
'LightGBM_BAG_L2/T3': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L2/T3/',
'LightGBM_BAG_L2/T4': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L2/T4/',
'LightGBM_BAG_L2/T5': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L2/T5/',
'XGBoost_BAG_L2/T1': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L2/T1/',
'XGBoost_BAG_L2/T2': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L2/T2/',
'XGBoost_BAG_L2/T3': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L2/T3/',
'XGBoost_BAG_L2/T4': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L2/T4/',
'XGBoost_BAG_L2/T5': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L2/T5/',
'NeuralNetTorch_BAG_L2/T1': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L2/T1/',
'NeuralNetTorch_BAG_L2/T2': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L2/T2/',
'WeightedEnsemble_L3': 'AutogluonModels/ag-20221025_033348/models/WeightedEnsemble_L3/',
'LightGBM_BAG_L3/T1': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L3/T1/',
'LightGBM_BAG_L3/T2': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L3/T2/',
'LightGBM_BAG_L3/T3': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L3/T3/',
'LightGBM_BAG_L3/T4': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L3/T4/',
'LightGBM_BAG_L3/T5': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L3/T5/',
'XGBoost_BAG_L3/T1': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L3/T1/',
'XGBoost_BAG_L3/T2': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L3/T2/',
'XGBoost_BAG_L3/T3': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L3/T3/',
'XGBoost_BAG_L3/T4': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L3/T4/',
'XGBoost_BAG_L3/T5': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L3/T5/',
'NeuralNetTorch_BAG_L3/T1': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L3/T1/',
'NeuralNetTorch_BAG_L3/T2': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L3/T2/',
'WeightedEnsemble_L4': 'AutogluonModels/ag-20221025_033348/models/WeightedEnsemble_L4/',
'LightGBM_BAG_L4/T1': 'AutogluonModels/ag-20221025_033348/models/LightGBM_BAG_L4/T1/',
'XGBoost_BAG_L4/T1': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L4/T1/',
'XGBoost_BAG_L4/T2': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L4/T2/',
'XGBoost_BAG_L4/T3': 'AutogluonModels/ag-20221025_033348/models/XGBoost_BAG_L4/T3/',
'NeuralNetTorch_BAG_L4/T1': 'AutogluonModels/ag-20221025_033348/models/NeuralNetTorch_BAG_L4/T1/',
'WeightedEnsemble_L5': 'AutogluonModels/ag-20221025_033348/models/WeightedEnsemble_L5/'},
'model_fit_times': {'LightGBM_BAG_L1/T1': 13.972326517105103,
'LightGBM_BAG_L1/T2': 11.960453033447266,
'LightGBM_BAG_L1/T3': 13.00580883026123,
'LightGBM_BAG_L1/T4': 11.389135122299194,
'LightGBM_BAG_L1/T5': 13.706729650497437,
'XGBoost_BAG_L1/T1': 8.402469635009766,
'XGBoost_BAG_L1/T2': 8.495672941207886,
'XGBoost_BAG_L1/T3': 9.999094486236572,
'XGBoost_BAG_L1/T4': 8.69914436340332,
'XGBoost_BAG_L1/T5': 8.123121738433838,
'NeuralNetTorch_BAG_L1/T1': 164.5671045780182,
'NeuralNetTorch_BAG_L1/T2': 72.62116146087646,
'WeightedEnsemble_L2': 0.6982748508453369,
'LightGBM_BAG_L2/T1': 23.056679248809814,
'LightGBM_BAG_L2/T2': 10.368008136749268,
'LightGBM_BAG_L2/T3': 11.335638523101807,
'LightGBM_BAG_L2/T4': 16.61685276031494,
'LightGBM_BAG_L2/T5': 13.889267206192017,
'XGBoost_BAG_L2/T1': 13.758523225784302,
'XGBoost_BAG_L2/T2': 10.956619262695312,
'XGBoost_BAG_L2/T3': 15.316812515258789,
'XGBoost_BAG_L2/T4': 11.188065528869629,
'XGBoost_BAG_L2/T5': 12.656120538711548,
'NeuralNetTorch_BAG_L2/T1': 83.44422554969788,
'NeuralNetTorch_BAG_L2/T2': 82.3506805896759,
'WeightedEnsemble_L3': 0.8108980655670166,
'LightGBM_BAG_L3/T1': 21.632235050201416,
'LightGBM_BAG_L3/T2': 11.28141975402832,
'LightGBM_BAG_L3/T3': 11.042673826217651,
'LightGBM_BAG_L3/T4': 14.633190631866455,
'LightGBM_BAG_L3/T5': 12.730988264083862,
'XGBoost_BAG_L3/T1': 13.672987937927246,
'XGBoost_BAG_L3/T2': 11.21860671043396,
'XGBoost_BAG_L3/T3': 14.184484958648682,
'XGBoost_BAG_L3/T4': 10.873233318328857,
'XGBoost_BAG_L3/T5': 9.534664630889893,
'NeuralNetTorch_BAG_L3/T1': 96.67067122459412,
'NeuralNetTorch_BAG_L3/T2': 48.49696183204651,
'WeightedEnsemble_L4': 1.085355520248413,
'LightGBM_BAG_L4/T1': 22.03608536720276,
'XGBoost_BAG_L4/T1': 15.67936086654663,
'XGBoost_BAG_L4/T2': 11.040079593658447,
'XGBoost_BAG_L4/T3': 14.956453800201416,
'NeuralNetTorch_BAG_L4/T1': 60.26199507713318,
'WeightedEnsemble_L5': 0.34942054748535156},
'model_pred_times': {'LightGBM_BAG_L1/T1': 1.1012475490570068,
'LightGBM_BAG_L1/T2': 0.3648674488067627,
'LightGBM_BAG_L1/T3': 1.0680522918701172,
'LightGBM_BAG_L1/T4': 0.8559582233428955,
'LightGBM_BAG_L1/T5': 1.3197510242462158,
'XGBoost_BAG_L1/T1': 0.17969942092895508,
'XGBoost_BAG_L1/T2': 0.13393759727478027,
'XGBoost_BAG_L1/T3': 0.19743633270263672,
'XGBoost_BAG_L1/T4': 0.09720110893249512,
'XGBoost_BAG_L1/T5': 0.10501289367675781,
'NeuralNetTorch_BAG_L1/T1': 0.5094702243804932,
'NeuralNetTorch_BAG_L1/T2': 0.17261028289794922,
'WeightedEnsemble_L2': 0.000949859619140625,
'LightGBM_BAG_L2/T1': 1.0991427898406982,
'LightGBM_BAG_L2/T2': 0.0955207347869873,
'LightGBM_BAG_L2/T3': 0.14444708824157715,
'LightGBM_BAG_L2/T4': 0.8534018993377686,
'LightGBM_BAG_L2/T5': 0.3068726062774658,
'XGBoost_BAG_L2/T1': 0.3856215476989746,
'XGBoost_BAG_L2/T2': 0.19321417808532715,
'XGBoost_BAG_L2/T3': 0.28562164306640625,
'XGBoost_BAG_L2/T4': 0.17363834381103516,
'XGBoost_BAG_L2/T5': 0.19623494148254395,
'NeuralNetTorch_BAG_L2/T1': 0.21064400672912598,
'NeuralNetTorch_BAG_L2/T2': 0.18002915382385254,
'WeightedEnsemble_L3': 0.0011408329010009766,
'LightGBM_BAG_L3/T1': 1.0261964797973633,
'LightGBM_BAG_L3/T2': 0.09936356544494629,
'LightGBM_BAG_L3/T3': 0.14368510246276855,
'LightGBM_BAG_L3/T4': 0.717216968536377,
'LightGBM_BAG_L3/T5': 0.2248694896697998,
'XGBoost_BAG_L3/T1': 0.3556482791900635,
'XGBoost_BAG_L3/T2': 0.15355491638183594,
'XGBoost_BAG_L3/T3': 0.2286205291748047,
'XGBoost_BAG_L3/T4': 0.1637876033782959,
'XGBoost_BAG_L3/T5': 0.1164860725402832,
'NeuralNetTorch_BAG_L3/T1': 0.2955484390258789,
'NeuralNetTorch_BAG_L3/T2': 0.16520476341247559,
'WeightedEnsemble_L4': 0.0011341571807861328,
'LightGBM_BAG_L4/T1': 1.0454127788543701,
'XGBoost_BAG_L4/T1': 0.41437458992004395,
'XGBoost_BAG_L4/T2': 0.1584944725036621,
'XGBoost_BAG_L4/T3': 0.31023573875427246,
'NeuralNetTorch_BAG_L4/T1': 0.2441577911376953,
'WeightedEnsemble_L5': 0.0008459091186523438},
'num_bag_folds': 5,
'max_stack_level': 5,
'model_hyperparams': {'LightGBM_BAG_L1/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L1/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L1/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L1/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L1/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L2': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L2/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L2/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L2/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L2/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L3': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L3/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3/T4': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L3/T5': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L3/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L3/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L4': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'LightGBM_BAG_L4/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L4/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L4/T2': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'XGBoost_BAG_L4/T3': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'NeuralNetTorch_BAG_L4/T1': {'use_orig_features': True,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True},
'WeightedEnsemble_L5': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True}},
'leaderboard': model score_val pred_time_val fit_time \
0 WeightedEnsemble_L3 -1.778244 7.022648 548.568486
1 XGBoost_BAG_L2/T3 -1.783042 6.390866 360.259035
2 LightGBM_BAG_L2/T3 -1.784718 6.249691 356.277861
3 WeightedEnsemble_L2 -1.784973 3.070834 264.599079
4 WeightedEnsemble_L4 -1.786558 11.019505 821.598609
5 XGBoost_BAG_L3/T3 -1.787126 10.458254 664.064200
6 LightGBM_BAG_L2/T5 -1.787787 6.412117 358.831490
7 LightGBM_BAG_L2/T2 -1.793227 6.200765 355.310230
8 WeightedEnsemble_L5 -1.794568 14.475055 1001.419703
9 XGBoost_BAG_L4/T3 -1.794605 14.230051 940.808287
10 LightGBM_BAG_L3/T3 -1.799348 10.373318 660.922389
11 LightGBM_BAG_L3/T5 -1.801689 10.454503 662.610704
12 LightGBM_BAG_L3/T2 -1.806713 10.328997 661.161135
13 LightGBM_BAG_L1/T5 -1.807358 1.319751 13.706730
14 LightGBM_BAG_L1/T3 -1.810727 1.068052 13.005809
15 XGBoost_BAG_L4/T1 -1.824299 14.334190 941.531194
16 XGBoost_BAG_L3/T1 -1.826169 10.585282 663.552703
17 NeuralNetTorch_BAG_L2/T2 -1.829284 6.285274 427.292903
18 NeuralNetTorch_BAG_L3/T2 -1.839217 10.394838 698.376677
19 NeuralNetTorch_BAG_L1/T2 -1.842914 0.172610 72.621161
20 NeuralNetTorch_BAG_L4/T1 -1.843181 14.163973 986.113829
21 LightGBM_BAG_L1/T2 -1.843772 0.364867 11.960453
22 NeuralNetTorch_BAG_L3/T1 -1.848099 10.525182 746.550387
23 XGBoost_BAG_L3/T4 -1.852206 10.393421 660.752949
24 NeuralNetTorch_BAG_L2/T1 -1.853336 6.315888 428.386448
25 XGBoost_BAG_L2/T1 -1.893179 6.490866 358.700746
26 XGBoost_BAG_L3/T5 -1.901856 10.346119 659.414380
27 XGBoost_BAG_L2/T4 -1.904067 6.278883 356.130288
28 LightGBM_BAG_L3/T4 -1.940244 10.946850 664.512906
29 LightGBM_BAG_L2/T4 -1.941501 6.958646 361.559075
30 XGBoost_BAG_L2/T5 -1.944034 6.301479 357.598343
31 NeuralNetTorch_BAG_L1/T1 -1.987288 0.509470 164.567105
32 XGBoost_BAG_L4/T2 -2.029927 14.078310 936.891913
33 XGBoost_BAG_L3/T2 -2.040494 10.383188 661.098322
34 XGBoost_BAG_L1/T3 -2.192204 0.197436 9.999094
35 XGBoost_BAG_L2/T2 -2.325588 6.298459 355.898842
36 XGBoost_BAG_L1/T5 -2.757694 0.105013 8.123122
37 XGBoost_BAG_L1/T4 -2.817106 0.097201 8.699144
38 LightGBM_BAG_L1/T4 -2.886574 0.855958 11.389135
39 XGBoost_BAG_L1/T1 -3.113564 0.179699 8.402470
40 LightGBM_BAG_L3/T1 -4.412642 11.255830 671.511950
41 LightGBM_BAG_L4/T1 -4.414217 14.965228 947.887919
42 LightGBM_BAG_L2/T1 -4.414685 7.204387 367.998902
43 LightGBM_BAG_L1/T1 -4.710417 1.101248 13.972327
44 XGBoost_BAG_L1/T2 -4.751995 0.133938 8.495673
pred_time_val_marginal fit_time_marginal stack_level can_infer \
0 0.001141 0.810898 3 True
1 0.285622 15.316813 2 True
2 0.144447 11.335639 2 True
3 0.000950 0.698275 2 True
4 0.001134 1.085356 4 True
5 0.228621 14.184485 3 True
6 0.306873 13.889267 2 True
7 0.095521 10.368008 2 True
8 0.000846 0.349421 5 True
9 0.310236 14.956454 4 True
10 0.143685 11.042674 3 True
11 0.224869 12.730988 3 True
12 0.099364 11.281420 3 True
13 1.319751 13.706730 1 True
14 1.068052 13.005809 1 True
15 0.414375 15.679361 4 True
16 0.355648 13.672988 3 True
17 0.180029 82.350681 2 True
18 0.165205 48.496962 3 True
19 0.172610 72.621161 1 True
20 0.244158 60.261995 4 True
21 0.364867 11.960453 1 True
22 0.295548 96.670671 3 True
23 0.163788 10.873233 3 True
24 0.210644 83.444226 2 True
25 0.385622 13.758523 2 True
26 0.116486 9.534665 3 True
27 0.173638 11.188066 2 True
28 0.717217 14.633191 3 True
29 0.853402 16.616853 2 True
30 0.196235 12.656121 2 True
31 0.509470 164.567105 1 True
32 0.158494 11.040080 4 True
33 0.153555 11.218607 3 True
34 0.197436 9.999094 1 True
35 0.193214 10.956619 2 True
36 0.105013 8.123122 1 True
37 0.097201 8.699144 1 True
38 0.855958 11.389135 1 True
39 0.179699 8.402470 1 True
40 1.026196 21.632235 3 True
41 1.045413 22.036085 4 True
42 1.099143 23.056679 2 True
43 1.101248 13.972327 1 True
44 0.133938 8.495673 1 True
fit_order
0 26
1 21
2 16
3 13
4 39
5 34
6 18
7 15
8 45
9 43
10 29
11 31
12 28
13 5
14 3
15 41
16 32
17 25
18 38
19 12
20 44
21 2
22 37
23 35
24 24
25 19
26 36
27 22
28 30
29 17
30 23
31 11
32 42
33 33
34 8
35 20
36 10
37 9
38 4
39 6
40 27
41 40
42 14
43 1
44 7 }
train_preds = predictor.predict(preprocessed_train_data.iloc[:,:-1])
train_preds = inv_boxcox(train_preds, 0.32)
train_preds_log = np.log(train_preds+1)
train_counts_log = np.log(preprocessed_train_data.count_transformed+1)
train_errors = (np.sqrt(abs(train_counts_log - train_preds_log))).mean()
print(f"training_RMSLE: {train_errors}")
training_RMSLE: 1.4667059728437757
predictor.evaluate(preprocessed_train_data)
{'root_mean_squared_error': -1.6096360875224123,
'mean_squared_error': -2.590928334254459,
'mean_absolute_error': -1.2651303562164757,
'r2': 0.9211630307049317,
'pearsonr': 0.9598795144791916,
'median_absolute_error': -1.0393269842290125}
predictions = predictor.predict(preprocessed_test_data)
predictions.head()
0 3.903300 1 1.349719 2 1.349719 3 1.351036 4 1.351036 Name: count_transformed, dtype: float32
# Describe the `predictions` series to see if there are any negative values
predictions.describe()
count 6493.000000 mean 11.590753 std 5.398349 min 1.224947 25% 7.556004 50% 12.249469 75% 15.754378 max 22.228214 Name: count_transformed, dtype: float64
predictions_final = inv_boxcox(predictions, 0.32)
predictions_final.describe()
count 6493.000000 mean 182.379761 std 163.481476 min 2.810969 25% 46.559299 50% 145.326248 75% 276.092896 max 693.740295 Name: count_transformed, dtype: float64
submission["count"] = predictions_final
submission.to_csv("submission.csv", index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "new features with hyperparameters new stack"
100%|█████████████████████████████████████████| 188k/188k [00:00<00:00, 365kB/s] Successfully submitted to Bike Sharing Demand
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 100
fileName date description status publicScore privateScore -------------- ------------------- --------------------------------------------------- -------- ----------- ------------ submission.csv 2022-10-25 03:54:35 new features with hyperparameters new stack complete 0.52384 0.52384 submission.csv 2022-10-25 03:33:32 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-25 02:52:44 first raw submission complete 1.76328 1.76328 submission.csv 2022-10-25 02:20:30 first raw submission complete 1.79318 1.79318 submission.csv 2022-10-25 01:08:27 new feats hpo log count complete 0.51762 0.51762 submission.csv 2022-10-25 01:07:45 new feats hpo log count complete 0.83696 0.83696 submission.csv 2022-10-25 00:43:35 new feats hpo rf complete 0.83696 0.83696 submission.csv 2022-10-25 00:21:57 new feats hpo gbrt complete 0.75869 0.75869 submission.csv 2022-10-25 00:18:26 new feats hpo gbrt error submission.csv 2022-10-25 00:18:16 new feats hpo gbrt error submission.csv 2022-10-25 00:17:32 new feats hpo gbrt complete 0.52983 0.52983 submission.csv 2022-10-24 23:27:28 new features with hyperparameters new stack complete 0.52983 0.52983 submission.csv 2022-10-24 11:38:44 new features with hyperparameters new stack complete 0.52804 0.52804 submission.csv 2022-10-24 11:23:58 new features with hyperparameters new stack complete 0.52902 0.52902 submission.csv 2022-10-24 11:02:22 new features with hyperparameters new stack complete 0.53233 0.53233 submission.csv 2022-10-24 10:42:58 new features with hyperparameters new stack complete 0.52854 0.52854 submission.csv 2022-10-24 10:12:46 new features with hyperparameters new stack complete 0.52530 0.52530 submission.csv 2022-10-24 09:48:59 new features with hyperparameters new stack complete 0.53233 0.53233 submission.csv 2022-10-24 09:46:37 new features with hyperparameters new stack complete 0.53233 0.53233 submission.csv 2022-10-24 09:27:51 new features with hyperparameters new stack complete 0.53288 0.53288 submission.csv 2022-10-24 09:12:28 new features with hyperparameters new stack complete 0.53473 0.53473 submission.csv 2022-10-24 08:59:18 new features with hyperparameters new stack complete 0.53278 0.53278 submission.csv 2022-10-22 05:10:04 new features with hyperparameters complete 0.52596 0.52596 submission.csv 2022-10-22 02:33:18 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-20 10:02:48 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-20 10:02:23 random test complete 0.54915 0.54915 submission.csv 2022-10-20 10:00:08 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-19 16:39:39 full feature engineering no hyperopt complete 0.56222 0.56222 submission.csv 2022-10-19 16:38:53 random test complete 1.17310 1.17310 submission.csv 2022-10-19 16:37:17 first raw submission complete 0.56222 0.56222 submission.csv 2022-10-19 16:35:01 first raw submission complete 1.71016 1.71016 submission.csv 2022-10-17 11:11:10 first raw submission complete 1.71016 1.71016 submission.csv 2022-10-16 12:34:51 first raw submission complete 1.83241 1.83241
0.52538¶
ln(count+1)¶Since the count variable is highly skewed, and the validation metric used on Kaggle is RMSLE (Root Mean Squared Log Error), it makes sense that we train our models on the natural log of the count variable ln(count + 1). This will allow the models to calculate the RMSE on the logs of the actual and predicted values, effectively allowing us to model on the metric of RMSLE.
First let us get rid of the BCT'd count values and replace it with the ln(count + 1) values. There is a +1 here as there are some count values which are 0, and ln requires all positive values.
This is fine however as it won't affect our metrics too much. RMSLE has a lower penalty for overestimated values, and a large penalty for underestimated values.
original_count_values = train['count'].values
log_count = np.log(original_count_values+1)
preprocessed_train_data.head()
| humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | ... | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | count_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 2.833213 |
| 1 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 3.713572 |
| 2 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 3.496508 |
| 3 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 2.639057 |
| 4 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 0.693147 |
5 rows × 45 columns
# replace the count values with these new log count values
preprocessed_train_data.count_transformed = log_count
preprocessed_train_data.head()
| humidity | holiday__0 | holiday__1 | workingday__0 | workingday__1 | weather_clear | weather_bad | weather_cloudy | season__1 | season__2 | ... | time_0 | time_1 | time_2 | time_3 | time_4 | time_5 | time_6 | time_7 | windspeed_transformed | count_transformed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.81 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 2.833213 |
| 1 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 3.713572 |
| 2 | 0.80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.219418 | 3.496508 |
| 3 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 2.639057 |
| 4 | 0.75 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.232204 | 0.693147 |
5 rows × 45 columns
predictor = TabularPredictor(label="count_transformed", eval_metric="root_mean_squared_error").fit(preprocessed_train_data,
time_limit=1200,
presets="best_quality",
auto_stack=True,
num_bag_folds=5,
num_bag_sets=2,
num_stack_levels=3,
hyperparameters=hyperparameters,
hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
verbosity=0
)
No path specified. Models will be saved in: "AutogluonModels/ag-20221025_043224/" 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 1.15061
60%|██████ | 3/5 [00:05<00:03, 1.71s/it]
[1000] valid_set's rmse: 0.676333
80%|████████ | 4/5 [00:06<00:01, 1.58s/it]
[1000] valid_set's rmse: 0.471268
100%|██████████| 5/5 [00:08<00:00, 1.76s/it] 100%|██████████| 5/5 [00:06<00:00, 1.33s/it] NaN or Inf found in input tensor. 2022-10-25 04:33:03,722 INFO stopper.py:364 -- Reached timeout of 19.19292210053158 seconds. Stopping all trials. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 1.07629
60%|██████ | 3/5 [00:07<00:04, 2.04s/it]
[1000] valid_set's rmse: 0.491639
100%|██████████| 5/5 [00:11<00:00, 2.31s/it] 100%|██████████| 5/5 [00:10<00:00, 2.16s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 1.10139
60%|██████ | 3/5 [00:07<00:04, 2.20s/it]
[1000] valid_set's rmse: 0.502794
100%|██████████| 5/5 [00:11<00:00, 2.38s/it] 100%|██████████| 5/5 [00:10<00:00, 2.11s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. 0%| | 0/5 [00:00<?, ?it/s]
[1000] valid_set's rmse: 1.10596
0%| | 0/5 [00:04<?, ?it/s] 40%|████ | 2/5 [00:06<00:10, 3.40s/it] NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor.
# predictions and target are both log values
train_preds_log = predictor.predict(preprocessed_train_data.iloc[:,:-1])
train_counts_log = preprocessed_train_data.count_transformed
train_errors = (np.sqrt(abs(train_counts_log - train_preds_log))).mean()
print(f"training_RMSLE: {train_errors}")
training_RMSLE: 0.4978055474901413
predictor.evaluate(preprocessed_train_data)
{'root_mean_squared_error': -0.41923692799495166,
'mean_squared_error': -0.17575960179464428,
'mean_absolute_error': -0.3063047142841683,
'r2': 0.9127598913619348,
'pearsonr': 0.9555067207868448,
'median_absolute_error': -0.22347962752124984}
predictions = predictor.predict(preprocessed_test_data)
predictions.head()
0 2.549858 1 1.355424 2 1.355424 3 1.354181 4 1.354181 Name: count_transformed, dtype: float32
# Describe the `predictions` series to see if there are any negative values
predictions.describe()
count 6493.000000 mean 4.581038 std 1.329285 min 1.250084 25% 3.795995 50% 4.954998 75% 5.622943 max 6.459423 Name: count_transformed, dtype: float64
Get the actual predictions back by taking the exponent and subtracting 1
predictions_final = np.exp(predictions) - 1
predictions_final.describe()
count 6493.000000 mean 178.648010 std 158.985306 min 2.490637 25% 43.522503 50% 140.882385 75% 275.702637 max 637.692444 Name: count_transformed, dtype: float64
submission["count"] = predictions_final
submission.to_csv("submission.csv", index=False)
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "new feats hpo log count"
100%|█████████████████████████████████████████| 188k/188k [00:00<00:00, 330kB/s] Successfully submitted to Bike Sharing Demand
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 7
fileName date description status publicScore privateScore -------------- ------------------- --------------------------------------------------- -------- ----------- ------------ submission.csv 2022-10-25 04:53:19 new feats hpo log count complete 0.51749 0.51749 submission.csv 2022-10-25 04:24:57 new feats hpo log count complete 0.52761 0.52761 submission.csv 2022-10-25 03:54:35 new features with hyperparameters new stack complete 0.52384 0.52384 submission.csv 2022-10-25 03:33:32 FeatEng with wind,count transformations no hyperopt complete 0.52900 0.52900 submission.csv 2022-10-25 02:52:44 first raw submission complete 1.76328 1.76328 tail: write error: Broken pipe
0.517¶
# Taking the top model score from each training run and creating a line plot to show improvement
# You can create these in the notebook and save them to PNG or use some other tool (e.g. google sheets, excel)
fig = pd.DataFrame(
{
"model_test_scores": ["initial", "new features no hpo", "new features BCT w/ hpo", "log count w/ hpo"],
"score": [1.832, 0.56222, 0.525, 0.51762]
}
).plot(x="model_test_scores", y="score", figsize=(8, 6)).get_figure()
fig.savefig('model_train_score.png')
# Take the 3 kaggle scores and creating a line plot to show improvement
fig = pd.DataFrame(
{
"model_train_scores": ["initial", "new features no hpo", "new features BCT w/ hpo", "log count w/ hpo"],
"score": [0.602, 1.467, 1.463, 0.4983]
}
).plot(x="model_train_scores", y="score", figsize=(8, 6)).get_figure()
fig.savefig('model_test_score.png')